-rw-r--r--  README.md | 37
-rw-r--r--  SECURITY.md | 6
-rwxr-xr-x  configure | 3
-rw-r--r--  configure.py | 76
-rw-r--r--  tensorflow/cc/gradients/nn_grad.cc | 64
-rw-r--r--  tensorflow/cc/gradients/nn_grad_test.cc | 44
-rw-r--r--  tensorflow/cc/profiler/profiler.h | 6
-rw-r--r--  tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc | 4
-rw-r--r--  tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py | 325
-rw-r--r--  tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py | 471
-rw-r--r--  tensorflow/contrib/gan/python/eval/python/summaries_test.py | 9
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 15
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers_test.py | 15
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java | 8
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java | 14
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java | 9
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc | 1
-rw-r--r--  tensorflow/contrib/lite/testing/generate_examples.py | 1
-rw-r--r--  tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py | 34
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py | 2
-rw-r--r--  tensorflow/contrib/slim/python/slim/data/parallel_reader.py | 2
-rw-r--r--  tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h | 2
-rw-r--r--  tensorflow/contrib/tensorrt/BUILD | 44
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 14
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc | 129
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_calib_op.h | 52
-rw-r--r--  tensorflow/contrib/tensorrt/ops/trt_calib_op.cc | 37
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc | 119
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h | 65
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc | 39
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_resource_manager.h | 49
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_resources.h | 95
-rw-r--r--  tensorflow/contrib/timeseries/python/timeseries/BUILD | 2
-rw-r--r--  tensorflow/contrib/timeseries/python/timeseries/head.py | 58
-rw-r--r--  tensorflow/contrib/verbs/README.md | 2
-rw-r--r--  tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md | 2
-rw-r--r--  tensorflow/contrib/verbs/rdma.cc | 1
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt | 85
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt | 13
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt | 33
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt | 32
-rw-r--r--  tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt | 4
-rw-r--r--  tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt | 4
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h | 2
-rw-r--r--  tensorflow/core/distributed_runtime/session_mgr.cc | 4
-rw-r--r--  tensorflow/core/framework/numeric_types.h | 42
-rw-r--r--  tensorflow/core/framework/variant_op_registry.h | 4
-rw-r--r--  tensorflow/core/grappler/optimizers/BUILD | 6
-rw-r--r--  tensorflow/core/grappler/optimizers/loop_optimizer.cc | 381
-rw-r--r--  tensorflow/core/grappler/optimizers/loop_optimizer.h | 26
-rw-r--r--  tensorflow/core/grappler/optimizers/loop_optimizer_test.cc | 489
-rw-r--r--  tensorflow/core/kernels/BUILD | 15
-rw-r--r--  tensorflow/core/kernels/cwise_op_maximum.cc | 4
-rw-r--r--  tensorflow/core/kernels/mkl_fused_batch_norm_op.cc | 96
-rw-r--r--  tensorflow/core/kernels/mkl_relu_op.cc | 20
-rw-r--r--  tensorflow/core/kernels/reshape_op.cc | 1
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.cc | 305
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.h | 117
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc | 143
-rw-r--r--  tensorflow/core/kernels/unique_op.cc | 10
-rw-r--r--  tensorflow/core/kernels/unravel_index_op.cc | 2
-rw-r--r--  tensorflow/core/ops/array_ops.cc | 17
-rw-r--r--  tensorflow/core/ops/math_ops.cc | 20
-rw-r--r--  tensorflow/core/platform/s3/s3_file_system.cc | 1
-rw-r--r--  tensorflow/core/platform/windows/port.cc | 14
-rw-r--r--  tensorflow/core/util/cuda_device_functions.h | 148
-rw-r--r--  tensorflow/core/util/cuda_kernel_helper.h | 54
-rw-r--r--  tensorflow/docs_src/get_started/checkpoints.md | 4
-rw-r--r--  tensorflow/docs_src/get_started/custom_estimators.md | 2
-rw-r--r--  tensorflow/docs_src/performance/xla/operation_semantics.md | 20
-rw-r--r--  tensorflow/docs_src/programmers_guide/saved_model.md | 60
-rw-r--r--  tensorflow/docs_src/programmers_guide/variables.md | 5
-rw-r--r--  tensorflow/examples/speech_commands/train.py | 6
-rw-r--r--  tensorflow/python/framework/test_util.py | 3
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/lstm_test.py | 4
-rw-r--r--  tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py | 2
-rw-r--r--  tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 165
-rw-r--r--  tensorflow/python/kernel_tests/unique_op_test.py | 33
-rw-r--r--  tensorflow/python/ops/array_ops.py | 12
-rw-r--r--  tensorflow/python/ops/bitwise_ops_test.py | 6
-rw-r--r--  tensorflow/python/ops/check_ops.py | 6
-rw-r--r--  tensorflow/python/ops/confusion_matrix.py | 18
-rw-r--r--  tensorflow/python/ops/distributions/special_math.py | 2
-rw-r--r--  tensorflow/python/ops/hidden_ops.txt | 2
-rw-r--r--  tensorflow/python/ops/image_ops_impl.py | 2
-rw-r--r--  tensorflow/python/ops/image_ops_test.py | 2
-rw-r--r--  tensorflow/python/ops/linalg/linear_operator_diag.py | 2
-rw-r--r--  tensorflow/python/ops/losses/losses_impl.py | 2
-rw-r--r--  tensorflow/python/ops/math_grad.py | 136
-rw-r--r--  tensorflow/python/ops/math_ops.py | 118
-rw-r--r--  tensorflow/python/ops/nn_impl.py | 2
-rw-r--r--  tensorflow/python/tools/saved_model_cli.py | 68
-rw-r--r--  tensorflow/python/tools/saved_model_cli_test.py | 141
-rw-r--r--  tensorflow/python/training/checkpoint_utils.py | 6
-rw-r--r--  tensorflow/python/training/checkpoint_utils_test.py | 4
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.pbtxt | 12
-rwxr-xr-x  tensorflow/tools/ci_build/builds/with_the_same_user | 7
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_bazel.sh | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu | 2
-rw-r--r--  tensorflow/tools/graph_transforms/BUILD | 2
-rw-r--r--  tensorflow/tools/graph_transforms/remove_control_dependencies.cc | 29
-rw-r--r--  tensorflow/tools/lib_package/BUILD | 16
103 files changed, 4051 insertions, 802 deletions
diff --git a/README.md b/README.md
index 916e5200b2..ef5bdc66ef 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,10 @@
-----------------
-| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
-|-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
+
+| **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
+|-----------------|---------------------|------------------|-------------------|---------------|---------------|
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
**TensorFlow** is an open source software library for numerical computation using
data flow graphs. The graph nodes represent mathematical operations, while
@@ -21,20 +22,6 @@ organization for the purposes of conducting machine learning and deep neural
networks research. The system is general enough to be applicable in a wide
variety of other domains, as well.
-**If you want to contribute to TensorFlow, be sure to review the [contribution
-guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
-[code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to
-uphold this code.**
-
-**We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs. So please see
-[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
-and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
-
-The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
-
-[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
-
## Installation
*See [Installing TensorFlow](https://www.tensorflow.org/get_started/os_setup.html) for instructions on how to install our release binaries or how to build from source.*
@@ -75,6 +62,22 @@ $ python
>>> sess.close()
```
+## Contribution guidelines
+
+**If you want to contribute to TensorFlow, be sure to review the [contribution
+guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
+[code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to
+uphold this code.**
+
+**We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
+tracking requests and bugs. So please see
+[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
+and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
+
+The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
+
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
+
## For more information
* [TensorFlow Website](https://www.tensorflow.org)
diff --git a/SECURITY.md b/SECURITY.md
index 6ddac1f964..fea24b2739 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -233,7 +233,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
### Known vulnerabilities
-| Type | Versions affected | Reported by | Additional Information |
-|------|:-----------------:|---------------------------------------|
-| out of bounds read| <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+| Type | Versions affected | Reported by | Additional Information |
+|-------------------|:-----------------:|--------------------|-----------------------------|
+| out of bounds read| <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
diff --git a/configure b/configure
index 9c21d2b03a..66b66ba54e 100755
--- a/configure
+++ b/configure
@@ -8,7 +8,8 @@ if [ -z "$PYTHON_BIN_PATH" ]; then
fi
# Set all env variables
-"$PYTHON_BIN_PATH" configure.py
+CONFIGURE_DIR=$(dirname "$0")
+"$PYTHON_BIN_PATH" "${CONFIGURE_DIR}/configure.py" "$@"
echo "Configuration finished"
diff --git a/configure.py b/configure.py
index 9744f6ac81..97f46757ee 100644
--- a/configure.py
+++ b/configure.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import argparse
import errno
import os
import platform
@@ -32,10 +33,6 @@ except ImportError:
from distutils.spawn import find_executable as which
# pylint: enable=g-import-not-at-top
-_TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
- '.tf_configure.bazelrc')
-_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
- 'WORKSPACE')
_DEFAULT_CUDA_VERSION = '9.0'
_DEFAULT_CUDNN_VERSION = '7'
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
@@ -51,6 +48,11 @@ _SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15]
_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
+_TF_WORKSPACE_ROOT = os.path.abspath(os.path.dirname(__file__))
+_TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
+_TF_BAZELRC = os.path.join(_TF_WORKSPACE_ROOT, _TF_BAZELRC_FILENAME)
+_TF_WORKSPACE = os.path.join(_TF_WORKSPACE_ROOT, 'WORKSPACE')
+
class UserInputError(Exception):
pass
@@ -119,22 +121,6 @@ def sed_in_place(filename, old, new):
f.write(newdata)
-def remove_line_with(filename, token):
- """Remove lines that contain token from file.
-
- Args:
- filename: string for filename.
- token: string token to check if to remove a line from file or not.
- """
- with open(filename, 'r') as f:
- filedata = f.read()
-
- with open(filename, 'w') as f:
- for line in filedata.strip().split('\n'):
- if token not in line:
- f.write(line + '\n')
-
-
def write_to_bazelrc(line):
with open(_TF_BAZELRC, 'a') as f:
f.write(line + '\n')
@@ -245,25 +231,30 @@ def setup_python(environ_cp):
environ_cp['PYTHON_BIN_PATH'] = python_bin_path
# Write tools/python_bin_path.sh
- with open('tools/python_bin_path.sh', 'w') as f:
+ with open(os.path.join(
+ _TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f:
f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path)
-def reset_tf_configure_bazelrc():
+def reset_tf_configure_bazelrc(workspace_path):
"""Reset file that contains customized config settings."""
open(_TF_BAZELRC, 'w').close()
-
- home = os.path.expanduser('~')
- if not os.path.exists('.bazelrc'):
- if os.path.exists(os.path.join(home, '.bazelrc')):
- with open('.bazelrc', 'a') as f:
- f.write('import %s/.bazelrc\n' % home.replace('\\', '/'))
+ bazelrc_path = os.path.join(workspace_path, '.bazelrc')
+
+ data = []
+ if os.path.exists(bazelrc_path):
+ with open(bazelrc_path, 'r') as f:
+ data = f.read().splitlines()
+ with open(bazelrc_path, 'w') as f:
+ for l in data:
+ if _TF_BAZELRC_FILENAME in l:
+ continue
+ f.write('%s\n' % l)
+ if is_windows():
+ tf_bazelrc_path = _TF_BAZELRC.replace("\\", "/")
else:
- open('.bazelrc', 'w').close()
-
- remove_line_with('.bazelrc', 'tf_configure')
- with open('.bazelrc', 'a') as f:
- f.write('import %workspace%/.tf_configure.bazelrc\n')
+ tf_bazelrc_path = _TF_BAZELRC
+ f.write('import %s\n' % tf_bazelrc_path)
def cleanup_makefile():
@@ -271,7 +262,8 @@ def cleanup_makefile():
These files could interfere with Bazel parsing.
"""
- makefile_download_dir = 'tensorflow/contrib/makefile/downloads'
+ makefile_download_dir = os.path.join(
+ _TF_WORKSPACE_ROOT, 'tensorflow', 'contrib', 'makefile', 'downloads')
if os.path.isdir(makefile_download_dir):
for root, _, filenames in os.walk(makefile_download_dir):
for f in filenames:
@@ -456,7 +448,7 @@ def check_bazel_version(min_version):
if which('bazel') is None:
print('Cannot find bazel. Please install bazel.')
sys.exit(0)
- curr_version = run_shell(['bazel', '--batch', 'version'])
+ curr_version = run_shell(['bazel', '--batch', '--bazelrc=/dev/null', 'version'])
for line in curr_version.split('\n'):
if 'Build label: ' in line:
@@ -502,7 +494,8 @@ def set_cc_opt_flags(environ_cp):
for opt in cc_opt_flags.split():
write_to_bazelrc('build:opt --copt=%s' % opt)
# It should be safe on the same build host.
- write_to_bazelrc('build:opt --host_copt=-march=native')
+ if not is_ppc64le():
+ write_to_bazelrc('build:opt --host_copt=-march=native')
write_to_bazelrc('build:opt --define with_default_optimizations=true')
# TODO(mikecase): Remove these default defines once we are able to get
# TF Lite targets building without them.
@@ -1229,7 +1222,7 @@ def set_host_c_compiler(environ_cp):
environ_cp,
var_name='HOST_C_COMPILER',
var_default=default_c_host_compiler,
- ask_for_var=('Please specify which C compiler should be used as the host'
+ ask_for_var=('Please specify which C compiler should be used as the host '
'C compiler.'),
check_success=os.path.exists,
error_msg='Invalid C compiler path. %s cannot be found.',
@@ -1373,13 +1366,20 @@ def config_info_line(name, help_text):
def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--workspace",
+ type=str,
+ default=_TF_WORKSPACE_ROOT,
+ help="The absolute path to your active Bazel workspace.")
+ args = parser.parse_args()
+
  # Make a copy of os.environ so it is clear when functions get and set
  # environment variables.
environ_cp = dict(os.environ)
check_bazel_version('0.5.4')
- reset_tf_configure_bazelrc()
+ reset_tf_configure_bazelrc(args.workspace)
cleanup_makefile()
setup_python(environ_cp)
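With the new `--workspace` flag, configuration can be driven from outside the source tree: the `configure` wrapper now forwards its arguments via `"$@"`, so an invocation such as `./configure --workspace=/home/me/tf-ws` (path hypothetical) rewrites `/home/me/tf-ws/.bazelrc` to import the absolute path of the `.tf_configure.bazelrc` generated in the source root, dropping any stale import lines first.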
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 9b732421e5..0cb3132e94 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -182,6 +182,70 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
+Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ std::vector<int32> ksize;
+ std::vector<int32> strides;
+ string padding;
+ string data_format;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ MaxPool3DGrad::Attrs grad_attrs;
+ auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
+ ksize, strides, padding,
+ grad_attrs.DataFormat(data_format));
+ grad_outputs->push_back(dx);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("MaxPool3D", MaxPool3DGradHelper);
+
+Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ std::vector<int32> ksize;
+ std::vector<int32> strides;
+ string padding;
+ string data_format;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ internal::AvgPoolGrad::Attrs grad_attrs;
+ auto dx =
+ internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
+ ksize, strides, padding,
+ grad_attrs.DataFormat(data_format));
+ grad_outputs->push_back(dx);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("AvgPool", AvgPoolGradHelper);
+
+Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ std::vector<int32> ksize;
+ std::vector<int32> strides;
+ string padding;
+ string data_format;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ AvgPool3DGrad::Attrs grad_attrs;
+ auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
+ ksize, strides, padding,
+ grad_attrs.DataFormat(data_format));
+ grad_outputs->push_back(dx);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("AvgPool3D", AvgPool3DGradHelper);
+
Status LRNGradHelper(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 0cfe5f6e3c..c4eba7ecb0 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -31,8 +31,11 @@ using ops::Elu;
using ops::L2Loss;
using ops::LogSoftmax;
using ops::LRN;
+using ops::AvgPool;
+using ops::AvgPool3D;
using ops::MaxPool;
using ops::MaxPoolV2;
+using ops::MaxPool3D;
using ops::Placeholder;
using ops::Relu;
using ops::Relu6;
@@ -70,9 +73,9 @@ class NNGradTest : public ::testing::Test {
// Sets tensor with random values, ensuring that the max value is largest by
// a reasonable amount.
- // This is an issue for MaxPool and MaxPoolV2, in which perturbations by the
- // numeric gradient computation in the gradient checker can change the max
- // value if values are too close together.
+ // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which
+ // perturbations by the numeric gradient computation in the gradient checker
+ // can change the max value if values are too close together.
template <typename T>
void SetRandomValuesWithBumpedMax(Tensor* tensor) {
auto tensor_flat = tensor->flat<T>();
@@ -203,6 +206,41 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
RunTest(x, x_init_value, y, y_shape);
}
+TEST_F(NNGradTest, MaxPool3DGradHelper) {
+ TensorShape x_shape({1, 3, 3, 3, 1});
+ TensorShape y_shape({1, 1, 1, 1, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+ // Setup window and strides so that we only do one MaxPool3D.
+ const std::vector<int> ksize{1, 3, 3, 3, 1};
+ const std::vector<int> strides{1, 3, 3, 3, 1};
+ auto y = MaxPool3D(scope_, x, ksize, strides, "VALID");
+ Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
+ SetRandomValuesWithBumpedMax<float>(&x_init_value);
+ RunTest(x, x_init_value, y, y_shape);
+}
+
+TEST_F(NNGradTest, AvgPoolGradHelper) {
+ TensorShape x_shape({1, 2, 2, 1});
+ TensorShape y_shape({1, 1, 1, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+ // Setup window and strides so that we only do one AvgPool.
+ const std::vector<int> ksize{1, 2, 2, 1};
+ const std::vector<int> strides{1, 2, 2, 1};
+ auto y = AvgPool(scope_, x, ksize, strides, "SAME");
+ RunTest(x, x_shape, y, y_shape);
+}
+
+TEST_F(NNGradTest, AvgPool3DGradHelper) {
+ TensorShape x_shape({1, 3, 3, 3, 1});
+ TensorShape y_shape({1, 1, 1, 1, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+ // Setup window and strides so that we only do one AvgPool3D.
+ const std::vector<int> ksize{1, 3, 3, 3, 1};
+ const std::vector<int> strides{1, 3, 3, 3, 1};
+ auto y = AvgPool3D(scope_, x, ksize, strides, "SAME");
+ RunTest(x, x_shape, y, y_shape);
+}
+
TEST_F(NNGradTest, LRN){
TensorShape x_shape({1, 1, 2, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
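The comment on `SetRandomValuesWithBumpedMax` is the key to these pooling tests: when two inputs sit within the finite-difference step of the maximum, the numeric gradient is smeared across both entries while the analytic gradient flows entirely to the argmax. A minimal NumPy sketch of that failure mode (illustrative values only, not part of the test suite):

```python
import numpy as np

def numeric_grad(f, x, eps=1e-4):
    """Central-difference gradient estimate of a scalar function f."""
    grad = np.zeros_like(x)
    for i in range(x.size):
        xp, xm = x.copy(), x.copy()
        xp[i] += eps
        xm[i] -= eps
        grad[i] = (f(xp) - f(xm)) / (2 * eps)
    return grad

x = np.array([1.0, 1.0 + 1e-9])   # two values closer together than eps
print(numeric_grad(np.max, x))    # ~[0.5, 0.5]: the perturbation flips the max
# The analytic gradient of max() is [0., 1.], all on the argmax. Bumping the
# max well above the other values keeps the two estimates consistent.
```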
diff --git a/tensorflow/cc/profiler/profiler.h b/tensorflow/cc/profiler/profiler.h
index 6077c45c58..64edbb5766 100644
--- a/tensorflow/cc/profiler/profiler.h
+++ b/tensorflow/cc/profiler/profiler.h
@@ -61,18 +61,18 @@ class Profiler {
/// Adds tracing information `run_meta` to profiler. A `run_meta` is
/// generated by a TensorFlow session run call. `step` is the key
/// to the `run_meta`. When calling ProfileXXX methods, caller can specify
- /// `step` in `options` to seletively profile the corresponding `run_meta`.
+ /// `step` in `options` to selectively profile the corresponding `run_meta`.
/// Multiple different `run_meta` can be keyed by the same `step` in order
/// to group them together.
void AddStep(int64 step, const RunMetadata& run_meta);
/// Profiles the model by organizing nodes in graph structure.
- /// Each node is an op and the nodes are contected by the op inputs/outputs.
+ /// Each node is an op and the nodes are connected by the op inputs/outputs.
GraphNodeProto ProfileGraph(const Options& options);
/// Profiles the model by organizing nodes in name scope structure.
/// Each node is an op, and nodes are organized by the ops' name
- /// scope, similar to a filesystem tree.
+ /// scope, similar to a file system tree.
/// E.g. /foo is the root of operation /foo/matmul_1 and foo/conv_2.
GraphNodeProto ProfileNameScope(const Options& options);
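The same step-keyed workflow is exposed to Python in this era of TF as `tf.profiler.Profiler`; the following is a hedged sketch of the C++ docs above (the toy graph and scope names are invented, and the profiler calls are believed to match the 1.x API):

```python
import tensorflow as tf

x = tf.random_normal([32, 64])
w = tf.Variable(tf.random_normal([64, 8]))
y = tf.matmul(x, w, name='foo/matmul_1')  # lives under name scope 'foo'

profiler = tf.profiler.Profiler(tf.get_default_graph())
run_meta = tf.RunMetadata()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(y,
           options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
           run_metadata=run_meta)
profiler.add_step(0, run_meta)  # key this run_meta by step 0
opts = tf.profiler.ProfileOptionBuilder.time_and_memory()
profiler.profile_name_scope(options=opts)  # nodes grouped like a file system tree
```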
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
index a50461cafd..beb574061b 100644
--- a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
@@ -17,4 +17,6 @@ limitations under the License.
#define __CUDACC__
#include "crt/host_config.h"
-int main(void) { return 0; }
+int main(void) {
+ return 0;
+}
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
new file mode 100644
index 0000000000..4ed7268e7a
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
@@ -0,0 +1,325 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental methods for tf.feature_column sequence input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import abc
+import collections
+
+
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
+
+# TODO(b/73160931): Fix pydoc.
+# pylint: disable=g-doc-args,missing-docstring,protected-access
+# TODO(b/73827486): Support SequenceExample.
+
+
+def sequence_input_layer(
+ features,
+ feature_columns,
+ weight_collections=None,
+ trainable=True,
+ scope=None):
+ """"Builds input layer for sequence input.
+
+ All `feature_columns` must be sequence dense columns with the same
+ `sequence_length`. The output of this method can be fed into sequence
+ networks, such as RNN.
+
+ The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+ `T` is the maximum sequence length for this batch, which could differ from
+ batch to batch.
+
+ If multiple `feature_columns` are given with `Di` `num_elements` each, their
+ outputs are concatenated. So, the final `Tensor` has shape
+ `[batch_size, T, D0 + D1 + ... + Dn]`.
+
+ Example:
+
+ ```python
+ rating = sequence_numeric_column('rating')
+ watches = sequence_categorical_column_with_identity(
+ 'watches', num_buckets=1000)
+ watches_embedding = embedding_column(watches, dimension=10)
+ columns = [rating, watches]
+
+ features = tf.parse_example(..., features=make_parse_example_spec(columns))
+ input_layer, sequence_length = sequence_input_layer(features, columns)
+
+ rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+ outputs, state = tf.nn.dynamic_rnn(
+ rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+ ```
+
+ Returns:
+ An `(input_layer, sequence_length)` tuple where:
+ - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+ `T` is the maximum sequence length for this batch, which could differ
+ from batch to batch. `D` is the sum of `num_elements` for all
+ `feature_columns`.
+ - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+ length for each example.
+ Raises:
+ ValueError: If any of the `feature_columns` is the wrong type.
+ """
+ feature_columns = fc._clean_feature_columns(feature_columns)
+ for c in feature_columns:
+ if not isinstance(c, _SequenceDenseColumn):
+ raise ValueError(
+ 'All feature_columns must be of type _SequenceDenseColumn. '
+ 'Given (type {}): {}'.format(type(c), c))
+
+ with variable_scope.variable_scope(
+ scope, default_name='sequence_input_layer', values=features.values()):
+ builder = fc._LazyBuilder(features)
+ output_tensors = []
+ sequence_lengths = []
+ ordered_columns = []
+ for column in sorted(feature_columns, key=lambda x: x.name):
+ ordered_columns.append(column)
+ with variable_scope.variable_scope(
+ None, default_name=column._var_scope_name):
+ dense_tensor, sequence_length = column._get_sequence_dense_tensor(
+ builder,
+ weight_collections=weight_collections,
+ trainable=trainable)
+ # Flattens the final dimension to produce a 3D Tensor.
+ num_elements = column._variable_shape.num_elements()
+ shape = array_ops.shape(dense_tensor)
+ output_tensors.append(
+ array_ops.reshape(
+ dense_tensor,
+ shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
+ sequence_lengths.append(sequence_length)
+ fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
+ # TODO(b/73160931): Verify sequence_length equality.
+ return array_ops.concat(output_tensors, -1), sequence_lengths[0]
+
+
+# TODO(b/73160931): Add remaining categorical columns.
+def sequence_categorical_column_with_identity(
+ key, num_buckets, default_value=None):
+ return _SequenceCategoricalColumn(
+ fc.categorical_column_with_identity(
+ key=key,
+ num_buckets=num_buckets,
+ default_value=default_value))
+
+
+# TODO(b/73160931): Merge with embedding_column
+def _sequence_embedding_column(
+ categorical_column, dimension, initializer=None, ckpt_to_load_from=None,
+ tensor_name_in_ckpt=None, max_norm=None, trainable=True):
+ if not isinstance(categorical_column, _SequenceCategoricalColumn):
+ raise ValueError(
+ 'categorical_column must be of type _SequenceCategoricalColumn. '
+ 'Given (type {}): {}'.format(
+ type(categorical_column), categorical_column))
+ return _SequenceEmbeddingColumn(
+ fc.embedding_column(
+ categorical_column,
+ dimension=dimension,
+ initializer=initializer,
+ ckpt_to_load_from=ckpt_to_load_from,
+ tensor_name_in_ckpt=tensor_name_in_ckpt,
+ max_norm=max_norm,
+ trainable=trainable))
+
+
+def sequence_numeric_column(
+ key,
+ shape=(1,),
+ default_value=0.,
+ dtype=dtypes.float32):
+ # TODO(b/73160931): Add validations.
+ return _SequenceNumericColumn(
+ key,
+ shape=shape,
+ default_value=default_value,
+ dtype=dtype)
+
+
+class _SequenceDenseColumn(fc._FeatureColumn):
+ """Represents dense sequence data."""
+
+ __metaclass__ = abc.ABCMeta
+
+ TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
+ 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
+
+ @abc.abstractproperty
+ def _variable_shape(self):
+ """`TensorShape` without batch and sequence dimensions."""
+ pass
+
+ @abc.abstractmethod
+ def _get_sequence_dense_tensor(
+ self, inputs, weight_collections=None, trainable=None):
+ """Returns a `TensorSequenceLengthPair`."""
+ pass
+
+
+def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
+ with ops.name_scope(None, 'sequence_length') as name_scope:
+ row_ids = sp_tensor.indices[:, 0]
+ column_ids = sp_tensor.indices[:, 1]
+ column_ids += array_ops.ones_like(column_ids)
+ seq_length = (
+ math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
+ # If the last n rows do not have ids, seq_length will have shape
+ # [batch_size - n]. Pad the remaining values with zeros.
+ n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
+ padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
+ return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
+
+
+class _SequenceCategoricalColumn(
+ fc._CategoricalColumn,
+ collections.namedtuple(
+ '_SequenceCategoricalColumn', ['categorical_column'])):
+
+ @property
+ def name(self):
+ return self.categorical_column.name
+
+ @property
+ def _parse_example_spec(self):
+ return self.categorical_column._parse_example_spec
+
+ def _transform_feature(self, inputs):
+ return self.categorical_column._transform_feature(inputs)
+
+ @property
+ def _num_buckets(self):
+ return self.categorical_column._num_buckets
+
+ def _get_sparse_tensors(self, inputs, weight_collections=None,
+ trainable=None):
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)
+ id_tensor = sparse_tensors.id_tensor
+ weight_tensor = sparse_tensors.weight_tensor
+ # Expands final dimension, so that embeddings are not combined during
+ # embedding lookup.
+ check_id_rank = check_ops.assert_equal(
+ array_ops.rank(id_tensor), 2,
+ data=[
+ 'Column {} expected ID tensor of rank 2. '.format(self.name),
+ 'id_tensor shape: ', array_ops.shape(id_tensor)])
+ with ops.control_dependencies([check_id_rank]):
+ id_tensor = sparse_ops.sparse_reshape(
+ id_tensor,
+ shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
+ if weight_tensor is not None:
+ check_weight_rank = check_ops.assert_equal(
+ array_ops.rank(weight_tensor), 2,
+ data=[
+ 'Column {} expected weight tensor of rank 2.'.format(self.name),
+ 'weight_tensor shape:', array_ops.shape(weight_tensor)])
+ with ops.control_dependencies([check_weight_rank]):
+ weight_tensor = sparse_ops.sparse_reshape(
+ weight_tensor,
+ shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
+ return fc._CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
+
+ def _sequence_length(self, inputs):
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)
+ return _sequence_length_from_sparse_tensor(sparse_tensors.id_tensor)
+
+
+class _SequenceEmbeddingColumn(
+ _SequenceDenseColumn,
+ collections.namedtuple('_SequenceEmbeddingColumn', ['embedding_column'])):
+
+ @property
+ def name(self):
+ return self.embedding_column.name
+
+ @property
+ def _parse_example_spec(self):
+ return self.embedding_column._parse_example_spec
+
+ def _transform_feature(self, inputs):
+ return self.embedding_column._transform_feature(inputs)
+
+ @property
+ def _variable_shape(self):
+ return self.embedding_column._variable_shape
+
+ def _get_sequence_dense_tensor(
+ self, inputs, weight_collections=None, trainable=None):
+ dense_tensor = self.embedding_column._get_dense_tensor(
+ inputs=inputs,
+ weight_collections=weight_collections,
+ trainable=trainable)
+ sequence_length = self.embedding_column.categorical_column._sequence_length(
+ inputs)
+ return _SequenceDenseColumn.TensorSequenceLengthPair(
+ dense_tensor=dense_tensor, sequence_length=sequence_length)
+
+
+class _SequenceNumericColumn(
+ _SequenceDenseColumn,
+ collections.namedtuple(
+ '_SequenceNumericColumn',
+ ['key', 'shape', 'default_value', 'dtype'])):
+
+ @property
+ def name(self):
+ return self.key
+
+ @property
+ def _parse_example_spec(self):
+ return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+ def _transform_feature(self, inputs):
+ return inputs.get(self.key)
+
+ @property
+ def _variable_shape(self):
+ return tensor_shape.TensorShape(self.shape)
+
+ def _get_sequence_dense_tensor(
+ self, inputs, weight_collections=None, trainable=None):
+ # Do nothing with weight_collections and trainable since no variables are
+ # created in this function.
+ del weight_collections
+ del trainable
+ sp_tensor = inputs.get(self)
+ dense_tensor = sparse_ops.sparse_tensor_to_dense(
+ sp_tensor, default_value=self.default_value)
+ # Reshape into [batch_size, T, variable_shape].
+ dense_shape = array_ops.concat(
+ [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape],
+ axis=0)
+ dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
+ sequence_length = _sequence_length_from_sparse_tensor(
+ sp_tensor, num_elements=self._variable_shape.num_elements())
+ return _SequenceDenseColumn.TensorSequenceLengthPair(
+ dense_tensor=dense_tensor, sequence_length=sequence_length)
+
+# pylint: enable=g-doc-args,missing-docstring,protected-access
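The subtlest helper above is `_sequence_length_from_sparse_tensor`: `segment_max` only yields entries up to the last non-empty row, so trailing examples without ids must be padded back to length zero. A NumPy analogue as a sketch (not the TF code path, and using integer division where the TF version divides as floats):

```python
import numpy as np

def sequence_length_from_indices(indices, batch_size, num_elements=1):
  """NumPy analogue of _sequence_length_from_sparse_tensor."""
  lengths = np.zeros(batch_size, dtype=np.int64)
  for row, col in np.asarray(indices):
    # Shifting column ids by one turns the max column id into a length.
    lengths[row] = max(lengths[row], col + 1)
  # Rows that never appear stay 0, mirroring the zero-padding step.
  return lengths // num_elements

# Same data as the test_sequence_length_with_zeros case in the test file:
print(sequence_length_from_indices([(1, 0), (3, 0), (3, 1)], batch_size=5))
# -> [0 1 0 2 0]
```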
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py
new file mode 100644
index 0000000000..59674869a2
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py
@@ -0,0 +1,471 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sequential_feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.feature_column.python.feature_column import sequential_feature_column as sfc
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
+
+class SequenceInputLayerTest(test.TestCase):
+
+ def test_embedding_column(self):
+ vocabulary_size = 3
+ sparse_input_a = sparse_tensor.SparseTensorValue(
+ # example 0, ids [2]
+ # example 1, ids [0, 1]
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(2, 0, 1),
+ dense_shape=(2, 2))
+ sparse_input_b = sparse_tensor.SparseTensorValue(
+ # example 0, ids [1]
+ # example 1, ids [2, 0]
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(1, 2, 0),
+ dense_shape=(2, 2))
+
+ embedding_dimension_a = 2
+ embedding_values_a = (
+ (1., 2.), # id 0
+ (3., 4.), # id 1
+ (5., 6.) # id 2
+ )
+ embedding_dimension_b = 3
+ embedding_values_b = (
+ (11., 12., 13.), # id 0
+ (14., 15., 16.), # id 1
+ (17., 18., 19.) # id 2
+ )
+ def _get_initializer(embedding_dimension, embedding_values):
+ def _initializer(shape, dtype, partition_info):
+ self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+ self.assertEqual(dtypes.float32, dtype)
+ self.assertIsNone(partition_info)
+ return embedding_values
+ return _initializer
+
+ expected_input_layer = [
+ # example 0, ids_a [2], ids_b [1]
+ [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+ # example 1, ids_a [0, 1], ids_b [2, 0]
+ [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
+ ]
+ expected_sequence_length = [1, 2]
+
+ categorical_column_a = sfc.sequence_categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ embedding_column_a = sfc._sequence_embedding_column(
+ categorical_column_a, dimension=embedding_dimension_a,
+ initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
+ categorical_column_b = sfc.sequence_categorical_column_with_identity(
+ key='bbb', num_buckets=vocabulary_size)
+ embedding_column_b = sfc._sequence_embedding_column(
+ categorical_column_b, dimension=embedding_dimension_b,
+ initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
+
+ input_layer, sequence_length = sfc.sequence_input_layer(
+ features={
+ 'aaa': sparse_input_a,
+ 'bbb': sparse_input_b,
+ },
+ # Test that columns are reordered alphabetically.
+ feature_columns=[embedding_column_b, embedding_column_a])
+
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+ self.assertItemsEqual(
+ ('sequence_input_layer/aaa_embedding/embedding_weights:0',
+ 'sequence_input_layer/bbb_embedding/embedding_weights:0'),
+ tuple([v.name for v in global_vars]))
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
+ self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
+ self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_numeric_column(self):
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[0.], [1]]
+ # example 1, [[10.]]
+ indices=((0, 0), (0, 1), (1, 0)),
+ values=(0., 1., 10.),
+ dense_shape=(2, 2))
+ expected_input_layer = [
+ [[0.], [1.]],
+ [[10.], [0.]],
+ ]
+ expected_sequence_length = [2, 1]
+ numeric_column = sfc.sequence_numeric_column('aaa')
+
+ input_layer, sequence_length = sfc.sequence_input_layer(
+ features={'aaa': sparse_input},
+ feature_columns=[numeric_column])
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_numeric_column_multi_dim(self):
+ """Tests sequence_input_layer for multi-dimensional numeric_column."""
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]]
+ # example 1, [[[10., 11.], [12., 13.]]]
+ indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
+ (1, 0), (1, 1), (1, 2), (1, 3)),
+ values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+ dense_shape=(2, 8))
+ # The output of numeric_column._get_dense_tensor should be flattened.
+ expected_input_layer = [
+ [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+ [[10., 11., 12., 13.], [0., 0., 0., 0.]],
+ ]
+ expected_sequence_length = [2, 1]
+ numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+ input_layer, sequence_length = sfc.sequence_input_layer(
+ features={'aaa': sparse_input},
+ feature_columns=[numeric_column])
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+ test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+ test_case.assertAllEqual(expected.indices, actual.indices)
+
+ test_case.assertEqual(
+ np.array(expected.values).dtype, np.array(actual.values).dtype)
+ test_case.assertAllEqual(expected.values, actual.values)
+
+ test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+ test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class SequenceCategoricalColumnWithIdentityTest(test.TestCase):
+
+ def test_get_sparse_tensors(self):
+ column = sfc.sequence_categorical_column_with_identity(
+ 'aaa', num_buckets=3)
+ inputs = sparse_tensor.SparseTensorValue(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(1, 2, 0),
+ dense_shape=(2, 2))
+ expected_sparse_ids = sparse_tensor.SparseTensorValue(
+ indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+ values=np.array((1, 2, 0), dtype=np.int64),
+ dense_shape=(2, 2, 1))
+
+ id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+ self.assertIsNone(id_weight_pair.weight_tensor)
+ with monitored_session.MonitoredSession() as sess:
+ _assert_sparse_tensor_value(
+ self,
+ expected_sparse_ids,
+ id_weight_pair.id_tensor.eval(session=sess))
+
+ def test_get_sparse_tensors_inputs3d(self):
+ """Tests _get_sparse_tensors when the input is already 3D Tensor."""
+ column = sfc.sequence_categorical_column_with_identity(
+ 'aaa', num_buckets=3)
+ inputs = sparse_tensor.SparseTensorValue(
+ indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+ values=(1, 2, 0),
+ dense_shape=(2, 2, 1))
+
+ with self.assertRaisesRegexp(
+ errors.InvalidArgumentError,
+ r'Column aaa expected ID tensor of rank 2\.\s*'
+ r'id_tensor shape:\s*\[2 2 1\]'):
+ id_weight_pair = column._get_sparse_tensors(
+ _LazyBuilder({'aaa': inputs}))
+ with monitored_session.MonitoredSession() as sess:
+ id_weight_pair.id_tensor.eval(session=sess)
+
+ def test_sequence_length(self):
+ column = sfc.sequence_categorical_column_with_identity(
+ 'aaa', num_buckets=3)
+ inputs = sparse_tensor.SparseTensorValue(
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(1, 2, 0),
+ dense_shape=(2, 2))
+ expected_sequence_length = [1, 2]
+
+ sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_sequence_length_with_zeros(self):
+ column = sfc.sequence_categorical_column_with_identity(
+ 'aaa', num_buckets=3)
+ inputs = sparse_tensor.SparseTensorValue(
+ indices=((1, 0), (3, 0), (3, 1)),
+ values=(1, 2, 0),
+ dense_shape=(5, 2))
+ expected_sequence_length = [0, 1, 0, 2, 0]
+
+ sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(test.TestCase):
+
+ def test_get_sequence_dense_tensor(self):
+ vocabulary_size = 3
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, ids [2]
+ # example 1, ids [0, 1]
+ # example 2, ids []
+ # example 3, ids [1]
+ indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+ values=(2, 0, 1, 1),
+ dense_shape=(4, 2))
+
+ embedding_dimension = 2
+ embedding_values = (
+ (1., 2.), # id 0
+ (3., 5.), # id 1
+ (7., 11.) # id 2
+ )
+ def _initializer(shape, dtype, partition_info):
+ self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+ self.assertEqual(dtypes.float32, dtype)
+ self.assertIsNone(partition_info)
+ return embedding_values
+
+ expected_lookups = [
+ # example 0, ids [2]
+ [[7., 11.], [0., 0.]],
+ # example 1, ids [0, 1]
+ [[1., 2.], [3., 5.]],
+ # example 2, ids []
+ [[0., 0.], [0., 0.]],
+ # example 3, ids [1]
+ [[3., 5.], [0., 0.]],
+ ]
+
+ categorical_column = sfc.sequence_categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ embedding_column = sfc._sequence_embedding_column(
+ categorical_column, dimension=embedding_dimension,
+ initializer=_initializer)
+
+ embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+ self.assertItemsEqual(
+ ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+ self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
+
+ def test_sequence_length(self):
+ vocabulary_size = 3
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, ids [2]
+ # example 1, ids [0, 1]
+ indices=((0, 0), (1, 0), (1, 1)),
+ values=(2, 0, 1),
+ dense_shape=(2, 2))
+ expected_sequence_length = [1, 2]
+
+ categorical_column = sfc.sequence_categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ embedding_column = sfc._sequence_embedding_column(
+ categorical_column, dimension=2)
+
+ _, sequence_length = embedding_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_sequence_length_with_empty_rows(self):
+ """Tests _sequence_length when some examples do not have ids."""
+ vocabulary_size = 3
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, ids []
+ # example 1, ids [2]
+ # example 2, ids [0, 1]
+ # example 3, ids []
+ # example 4, ids [1]
+ # example 5, ids []
+ indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+ values=(2, 0, 1, 1),
+ dense_shape=(6, 2))
+ expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+ categorical_column = sfc.sequence_categorical_column_with_identity(
+ key='aaa', num_buckets=vocabulary_size)
+ embedding_column = sfc._sequence_embedding_column(
+ categorical_column, dimension=2)
+
+ _, sequence_length = embedding_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceNumericColumnTest(test.TestCase):
+
+ def test_get_sequence_dense_tensor(self):
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[0.], [1]]
+ # example 1, [[10.]]
+ indices=((0, 0), (0, 1), (1, 0)),
+ values=(0., 1., 10.),
+ dense_shape=(2, 2))
+ expected_dense_tensor = [
+ [[0.], [1.]],
+ [[10.], [0.]],
+ ]
+ numeric_column = sfc.sequence_numeric_column('aaa')
+
+ dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_dense_tensor, dense_tensor.eval(session=sess))
+
+ def test_get_sequence_dense_tensor_with_shape(self):
+ """Tests get_sequence_dense_tensor with shape !=(1,)."""
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[0., 1., 2.], [3., 4., 5.]]
+ # example 1, [[10., 11., 12.]]
+ indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
+ (1, 0), (1, 1), (1, 2)),
+ values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
+ dense_shape=(2, 6))
+ expected_dense_tensor = [
+ [[0., 1., 2.], [3., 4., 5.]],
+ [[10., 11., 12.], [0., 0., 0.]],
+ ]
+ numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+
+ dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_dense_tensor, dense_tensor.eval(session=sess))
+
+ def test_get_dense_tensor_multi_dim(self):
+ """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]]
+ # example 1, [[[10., 11.], [12., 13.]]]
+ indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
+ (1, 0), (1, 1), (1, 2), (1, 3)),
+ values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+ dense_shape=(2, 8))
+ expected_dense_tensor = [
+ [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+ [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
+ ]
+ numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+ dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_dense_tensor, dense_tensor.eval(session=sess))
+
+ def test_sequence_length(self):
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[0., 1., 2.], [3., 4., 5.]]
+ # example 1, [[10., 11., 12.]]
+ indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
+ (1, 0), (1, 1), (1, 2)),
+ values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
+ dense_shape=(2, 6))
+ expected_sequence_length = [2, 1]
+ numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+
+ _, sequence_length = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_sequence_length_with_shape(self):
+ """Tests _sequence_length with shape !=(1,)."""
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values [[0.], [1]]
+ # example 1, [[10.]]
+ indices=((0, 0), (0, 1), (1, 0)),
+ values=(0., 1., 10.),
+ dense_shape=(2, 2))
+ expected_sequence_length = [2, 1]
+ numeric_column = sfc.sequence_numeric_column('aaa')
+
+ _, sequence_length = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+ def test_sequence_length_with_empty_rows(self):
+ """Tests _sequence_length when some examples do not have ids."""
+ sparse_input = sparse_tensor.SparseTensorValue(
+ # example 0, values []
+ # example 1, values [[0.], [1.]]
+ # example 2, [[2.]]
+ # example 3, values []
+ # example 4, [[3.]]
+ # example 5, values []
+ indices=((1, 0), (1, 1), (2, 0), (4, 0)),
+ values=(0., 1., 2., 3.),
+ dense_shape=(6, 2))
+ expected_sequence_length = [0, 2, 1, 0, 1, 0]
+ numeric_column = sfc.sequence_numeric_column('aaa')
+
+ _, sequence_length = numeric_column._get_sequence_dense_tensor(
+ _LazyBuilder({'aaa': sparse_input}))
+
+ with monitored_session.MonitoredSession() as sess:
+ self.assertAllEqual(
+ expected_sequence_length, sequence_length.eval(session=sess))
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 5549df971d..45eb108586 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -71,10 +71,11 @@ def get_cyclegan_model():
class SummariesTest(test.TestCase):
- def _test_add_gan_model_image_summaries_impl(
- self, get_model_fn, expected_num_summary_ops, model_summaries):
- summaries.add_gan_model_image_summaries(
- get_model_fn(), grid_size=2, model_summaries=model_summaries)
+ def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
+ expected_num_summary_ops,
+ model_summaries):
+ summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
+ model_summaries=model_summaries)
self.assertEquals(expected_num_summary_ops,
len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 559c0c63da..350bcb3bca 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -58,12 +58,12 @@ __all__ = [
'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
- 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', 'dropout',
- 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'images_to_sequence',
- 'layer_norm', 'linear', 'pool', 'max_pool2d', 'max_pool3d',
- 'one_hot_encoding', 'relu', 'relu6', 'repeat', 'scale_gradient',
- 'separable_conv2d', 'separable_convolution2d', 'sequence_to_images',
- 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
+ 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
+ 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn',
+ 'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
+ 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
+ 'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
+ 'sequence_to_images', 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
'legacy_fully_connected', 'legacy_linear', 'legacy_relu', 'maxout'
]
@@ -2718,7 +2718,8 @@ def sequence_to_images(inputs,
num_batches = -1
else:
num_batches = num_batches // height
- reshaped = array_ops.reshape(inputs, [width, num_batches, height, depth])
+ reshaped = array_ops.reshape(inputs,
+ [width, num_batches, height, depth])
if output_data_format == 'channels_first':
outputs = array_ops.transpose(reshaped, [1, 3, 2, 0])
else:
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index ba70432c48..997f910a2a 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -3447,8 +3447,9 @@ class SequenceToImagesTest(test.TestCase):
num_time_steps = 11
num_channels = 5
desired_height = 7
- sequence = np.random.uniform(
- size=(num_time_steps, num_batches, num_channels)).astype(np.float32)
+ sequence = np.random.uniform(size=(num_time_steps,
+ num_batches,
+ num_channels)).astype(np.float32)
output = _layers.sequence_to_images(sequence, desired_height)
self.assertListEqual(output.get_shape().as_list(), [2, 7, 11, 5])
@@ -3457,10 +3458,12 @@ class SequenceToImagesTest(test.TestCase):
num_time_steps = 11
num_channels = 5
desired_height = 7
- sequence = np.random.uniform(
- size=(num_time_steps, num_batches, num_channels)).astype(np.float32)
- output = _layers.sequence_to_images(
- sequence, desired_height, output_data_format='channels_first')
+ sequence = np.random.uniform(size=(num_time_steps,
+ num_batches,
+ num_channels)).astype(np.float32)
+ output = _layers.sequence_to_images(sequence,
+ desired_height,
+ output_data_format='channels_first')
self.assertListEqual(output.get_shape().as_list(), [2, 5, 7, 11])
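The `sequence_to_images` tests above exercise a reshape followed by a transpose. A numpy sketch of the same shape bookkeeping, using the test's dimensions; the `channels_last` permutation is an assumption, since only the `channels_first` branch appears in the hunk:

```python
import numpy as np

time, batch, height, channels = 11, 2, 7, 5
seq = np.random.uniform(size=(time, batch * height, channels)).astype(np.float32)

# Mirrors: reshape(inputs, [width, num_batches, height, depth])
reshaped = seq.reshape(time, batch, height, channels)
channels_first = reshaped.transpose(1, 3, 2, 0)  # same permutation as the hunk
channels_last = reshaped.transpose(1, 2, 0, 3)   # assumed for the else-branch
print(channels_first.shape, channels_last.shape)  # (2, 5, 7, 11) (2, 7, 11, 5)
```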
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 2c91be9d62..c57bb348c5 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -20,6 +20,9 @@ import android.content.res.AssetFileDescriptor;
import android.graphics.Bitmap;
import android.os.SystemClock;
import android.util.Log;
+
+import org.tensorflow.lite.Interpreter;
+
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
@@ -34,9 +37,10 @@ import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
-import org.tensorflow.lite.Interpreter;
-/** Classifies images with Tensorflow Lite. */
+/**
+ * Classifies images with TensorFlow Lite.
+ */
public abstract class ImageClassifier {
/** Tag for the {@link Log}. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
index 3108422952..be17b85e0c 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
@@ -16,22 +16,24 @@ limitations under the License.
package com.example.android.tflitecamerademo;
import android.app.Activity;
+
import java.io.IOException;
/**
- * This classifier works with the Inception-v3 slim model. It applies floating point inference
- * rather than using a quantized model.
+ * This classifier works with the Inception-v3 slim model.
+ * It applies floating point inference rather than using a quantized model.
*/
public class ImageClassifierFloatInception extends ImageClassifier {
- /** The inception net requires additional normalization of the used input. */
+ /**
+ * The inception net requires additional normalization of the used input.
+ */
private static final int IMAGE_MEAN = 128;
-
private static final float IMAGE_STD = 128.0f;
/**
- * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
- * of the super class, because we need a primitive array here.
+ * An array to hold inference results, to be fed into TensorFlow Lite as outputs.
+ * This isn't part of the super class, because we need a primitive array here.
*/
private float[][] labelProbArray = null;
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index ee89dbd375..c533de7927 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -16,14 +16,17 @@ limitations under the License.
package com.example.android.tflitecamerademo;
import android.app.Activity;
+
import java.io.IOException;
-/** This classifier works with the quantized MobileNet model. */
+/**
+ * This classifier works with the quantized MobileNet model.
+ */
public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
/**
- * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
- * of the super class, because we need a primitive array here.
+ * An array to hold inference results, to be fed into TensorFlow Lite as outputs.
+ * This isn't part of the super class, because we need a primitive array here.
*/
private byte[][] labelProbArray = null;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 883c7f270d..780401e052 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include <string.h>
#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 2481add769..5488b71fcf 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -36,6 +36,7 @@ import traceback
import zipfile
import numpy as np
from six import StringIO
+from six.moves import xrange
# TODO(aselle): Disable GPU for now
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index f21915ffbc..63fdd91d36 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1585,7 +1585,8 @@ class WeightNormLSTMCellTest(test.TestCase):
with self.test_session() as sess:
init = init_ops.constant_initializer(0.5)
- with variable_scope.variable_scope("root", initializer=init):
+ with variable_scope.variable_scope("root",
+ initializer=init):
x = array_ops.zeros([1, 2])
c0 = array_ops.zeros([1, 2])
h0 = array_ops.zeros([1, 2])
@@ -1595,12 +1596,11 @@ class WeightNormLSTMCellTest(test.TestCase):
xout, sout = cell()(x, state0)
sess.run([variables.global_variables_initializer()])
- res = sess.run(
- [xout, sout], {
- x.name: np.array([[1., 1.]]),
- c0.name: 0.1 * np.asarray([[0, 1]]),
- h0.name: 0.1 * np.asarray([[2, 3]]),
- })
+ res = sess.run([xout, sout], {
+ x.name: np.array([[1., 1.]]),
+ c0.name: 0.1 * np.asarray([[0, 1]]),
+ h0.name: 0.1 * np.asarray([[2, 3]]),
+ })
actual_state_c = res[1].c
actual_state_h = res[1].h
@@ -1611,8 +1611,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell w/o peepholes and w/o normalisation."""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=False, use_peepholes=False)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=False,
+ use_peepholes=False)
actual_c, actual_h = self._cell_output(cell)
@@ -1626,8 +1627,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell with peepholes and w/o normalisation."""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=False, use_peepholes=True)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=False,
+ use_peepholes=True)
actual_c, actual_h = self._cell_output(cell)
@@ -1641,8 +1643,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell w/o peepholes and with normalisation."""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=True, use_peepholes=False)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=True,
+ use_peepholes=False)
actual_c, actual_h = self._cell_output(cell)
@@ -1656,8 +1659,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell with peepholes and with normalisation."""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=True, use_peepholes=True)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=True,
+ use_peepholes=True)
actual_c, actual_h = self._cell_output(cell)
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 6e57ccd6dd..03fe31abf7 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -722,7 +722,7 @@ def _mask_probs(probs, eos_token, finished):
eos_token,
vocab_size,
dtype=probs.dtype,
- on_value=0.,
+ on_value=ops.convert_to_tensor(0., dtype=probs.dtype),
off_value=probs.dtype.min)
finished_probs = array_ops.tile(
array_ops.reshape(finished_row, [1, 1, -1]),
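The `on_value` change above keeps `one_hot`'s on/off values in `probs.dtype`, avoiding a dtype mismatch. A numpy sketch of the underlying masking idea (illustrative shapes and names, not the library code):

```python
import numpy as np

vocab_size, eos_token = 5, 3
dtype = np.float32
# one_hot(eos_token, vocab_size, on_value=0, off_value=dtype.min): a finished
# beam keeps only eos_token as a viable continuation.
finished_row = np.full(vocab_size, np.finfo(dtype).min, dtype=dtype)
finished_row[eos_token] = dtype(0)  # the on_value must already be probs.dtype
probs = np.log(np.full((2, vocab_size), 0.2, dtype=dtype))  # [beams, vocab]
finished = np.array([True, False])
masked = np.where(finished[:, None], finished_row[None, :], probs)
print(masked.argmax(axis=1))  # beam 0 (finished) -> 3; beam 1 unchanged
```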
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index ad5e985487..b3343aef47 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -221,7 +221,7 @@ def parallel_read(data_sources,
the data will be cycled through indefinitely.
num_readers: a integer, number of Readers to create.
reader_kwargs: an optional dict, of kwargs for the reader.
- shuffle: boolean, wether should shuffle the files and the records by using
+ shuffle: boolean, whether to shuffle the files and the records by using
RandomShuffleQueue as common_queue.
dtypes: A list of types. The length of dtypes must equal the number
of elements in each record. If it is None it will default to
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index 04e6b0a735..dc3e9fe79d 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -468,7 +468,7 @@ class FixedSizeSparseClassificationGrowStats : public ClassificationStats {
void PackToProto(FertileSlot* slot) const override;
void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
- LeafStat* right_stats) const;
+ LeafStat* right_stats) const override;
protected:
void ClassificationAddSplitStats() override {
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 3b7b68f61b..c832c6f2e0 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -47,7 +47,10 @@ tf_cuda_cc_test(
tf_custom_op_library(
name = "python/ops/_trt_engine_op.so",
- srcs = ["ops/trt_engine_op.cc"],
+ srcs = [
+ "ops/trt_calib_op.cc",
+ "ops/trt_engine_op.cc",
+ ],
deps = [
":trt_engine_op_kernel",
":trt_shape_function",
@@ -71,11 +74,18 @@ tf_cuda_library(
cc_library(
name = "trt_engine_op_kernel",
- srcs = ["kernels/trt_engine_op.cc"],
- hdrs = ["kernels/trt_engine_op.h"],
+ srcs = [
+ "kernels/trt_calib_op.cc",
+ "kernels/trt_engine_op.cc",
+ ],
+ hdrs = [
+ "kernels/trt_calib_op.h",
+ "kernels/trt_engine_op.h",
+ ],
copts = tf_copts(),
deps = [
":trt_logging",
+ ":trt_resources",
"//tensorflow/core:gpu_headers_lib",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:stream_executor_headers_lib",
@@ -87,7 +97,10 @@ cc_library(
)
tf_gen_op_libs(
- op_lib_names = ["trt_engine_op"],
+ op_lib_names = [
+ "trt_engine_op",
+ "trt_calib_op",
+ ],
deps = if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
@@ -109,6 +122,7 @@ tf_gen_op_wrapper_py(
name = "trt_engine_op",
gen_locally = True,
deps = [
+ ":trt_calib_op_op_lib",
":trt_engine_op_op_lib",
":trt_logging",
":trt_shape_function",
@@ -172,6 +186,27 @@ tf_py_wrap_cc(
],
)
+tf_cuda_library(
+ name = "trt_resources",
+ srcs = [
+ "resources/trt_int8_calibrator.cc",
+ "resources/trt_resource_manager.cc",
+ ],
+ hdrs = [
+ "resources/trt_int8_calibrator.h",
+ "resources/trt_resource_manager.h",
+ "resources/trt_resources.h",
+ ],
+ deps = [
+ ":trt_logging",
+ "//tensorflow/core:framework_headers_lib",
+ "//tensorflow/core:framework_lite",
+ "//tensorflow/core:lib_proto_parsing",
+ ] + if_tensorrt([
+ "@local_config_tensorrt//:nv_infer",
+ ]),
+)
+
# Library for the node-level conversion portion of TensorRT operation creation
tf_cuda_library(
name = "trt_conversion",
@@ -186,6 +221,7 @@ tf_cuda_library(
deps = [
":segment",
":trt_logging",
+ ":trt_resources",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework",
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4003ba056d..9ee717dd7f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -809,9 +809,9 @@ tensorflow::Status BinaryTensorOpTensor(
CHECK_EQ_TYPE(tensor_r->getType(), dtype);
auto op_pair = ops.find(node_def.op());
if (op_pair == ops.end())
- return tensorflow::errors::Unimplemented(
- "binary op: " + node_def.op() +
- " not supported at: " + node_def.name());
+ return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+ " not supported at: " +
+ node_def.name());
nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
*const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1471,13 +1471,13 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
<< std::to_string(op_info_vec.size());
// TODO(ben,jie): update TRT input format/dimension
- nvinfer1::DimsCHW input_dim_pseudo_chw;
- for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
+ nvinfer1::DimsCHW input_dim_pseudo_chw;
+ for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
for (int i = 1; i < op_info.shape().dim_size(); i++) {
VLOG(2) << "dimension: " << i
<< " , size: " << op_info.shape().dim(i).size();
- input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
+ input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
}
// TODO(ben,jie): proper way to restore input tensor name?
@@ -1486,7 +1486,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
input_tensor_name = node_name + ":" + std::to_string(output_idx);
nvinfer1::ITensor* input_tensor = converter.network()->addInput(
- input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
+ input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
if (!input_tensor)
return tensorflow::errors::InvalidArgument(
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
new file mode 100644
index 0000000000..1dcb87e768
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/kernels/trt_calib_op.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+TRTCalibOp::TRTCalibOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("segment_nodes", &segment_nodes_));
+ OP_REQUIRES_OK(context, context->GetAttr("input_names", &input_names_));
+ OP_REQUIRES_OK(context, context->GetAttr("resource_name", &resource_name_));
+}
+
+#define TYPECASE(dt, X, Y) \
+ case dt: { \
+ return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \
+ }
+
+void* GetTensorAddress(const Tensor* tensor_ptr) {
+ auto tensor_type = tensor_ptr->dtype();
+ switch (tensor_type) {
+ TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr);
+ TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr);
+ TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr);
+ default: {
+ LOG(FATAL) << "Unsupported Data type "
+ << tensorflow::DataTypeString(tensor_type);
+ return nullptr;
+ }
+ }
+}
+
+void TRTCalibOp::Compute(tensorflow::OpKernelContext* ctx) {
+ // TODO(aaroey): make sure ctx->resource_mgr() is used in future PR.
+ auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
+ auto res_mgr = trt_rm->getManager("TRTCalibOps");
+ tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
+ auto status = res_mgr->Lookup(resource_name_, resource_name_, &calib_res);
+
+ if (!status.ok()) {
+ ctx->SetStatus(status);
+ return;
+ }
+ int num_inputs = ctx->num_inputs();
+ // On the first run, instantiate the calibrator.
+ if (calib_res->calibrator_ == nullptr) {
+ dev_tensors_.resize(num_inputs);
+ int batch_size = ctx->input(0).dim_size(0);
+ VLOG(1) << " Constructing calibrator";
+ for (int i = 0; i < num_inputs; i++) {
+ // allocate workspace on device for inputs
+ const tensorflow::Tensor& t = ctx->input(i);
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_persistent(t.dtype(), t.shape(),
+ &dev_tensors_.at(i), nullptr));
+ const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+ CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
+ void* device_address = GetTensorAddress(device_tensor);
+ device_buffers_.emplace(input_names_.at(i),
+ std::pair<void*, size_t>(
+ device_address, device_tensor->TotalBytes()));
+ }
+
+ calib_res->calibrator_ =
+ new TRTInt8Calibrator(device_buffers_, batch_size, resource_name_);
+ string label(resource_name_);
+ calib_res->thr_ = new std::thread([calib_res, label]() {
+ VLOG(1) << "Starting calibration thread, Calibration Resource @ "
+ << calib_res;
+ calib_res->builder_->setInt8Calibrator(calib_res->calibrator_);
+ calib_res->builder_->setInt8Mode(true);
+ calib_res->engine_ = calib_res->builder_->buildCudaEngine(
+ *calib_res->network_); // will loop until we terminate calibrator
+ VLOG(1) << "Calibration loop terminated " << label;
+ });
+ VLOG(1) << "initialized calibrator resource";
+ } // calibrator initialized
+
+ // Pass input data to calibrator
+ std::unordered_map<string, void*> input_data;
+ for (int i = 0; i < num_inputs; i++) {
+ const Tensor& t = ctx->input(i);
+ void* data_address = GetTensorAddress(&t);
+ const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+ CHECK_EQ(t.TotalBytes(),
+ device_tensor->TotalBytes()); // use the tensor so FW keeps it
+ input_data.emplace(input_names_.at(i), data_address);
+ ctx->set_output(i, t);
+ }
+ VLOG(2) << "Filled map for sending";
+ calib_res->calibrator_->setBatch(input_data);
+ VLOG(2) << "Passed calibration data";
+ // TODO(aaroey): make sure we wait for the completion of calibration on the
+ // last batch in future PR.
+}
+
+#undef TYPECASE
+
+REGISTER_KERNEL_BUILDER(Name("TRTCalibOp").Device(DEVICE_GPU), TRTCalibOp);
+
+} // namespace tensorrt
+} // namespace tensorflow
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
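A schematic Python sketch of the control flow in `TRTCalibOp::Compute` above; `make_calibrator` and `build_engine` are hypothetical helpers standing in for the TensorRT calls. The first batch sets up resources and the builder thread, and every batch is then pushed to the calibrator:

```python
import threading

class CalibResource:
    """Stands in for TRTCalibrationResource (calibrator_, thr_, ...)."""
    def __init__(self):
        self.calibrator = None
        self.thread = None

def compute(res, batch, make_calibrator, build_engine):
    # First run only: create the calibrator and start the builder thread.
    # The thread blocks inside TensorRT, pulling batches until setDone().
    if res.calibrator is None:
        res.calibrator = make_calibrator(batch)
        res.thread = threading.Thread(target=build_engine,
                                      args=(res.calibrator,))
        res.thread.start()
    # Every run: hand the batch to the calibrator and pass inputs through.
    res.calibrator.set_batch(batch)
    return batch
```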
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
new file mode 100644
index 0000000000..23df9db32f
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
+#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+// TODO(sami): Convert this to async kernel!
+class TRTCalibOp : public OpKernel {
+ public:
+ explicit TRTCalibOp(OpKernelConstruction* context);
+
+ void Compute(OpKernelContext* context) override;
+
+ private:
+ string resource_name_;
+ std::vector<string> segment_nodes_;
+ std::vector<string> input_names_;
+ std::vector<tensorflow::TensorShape> shapes_;
+ std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
+ std::vector<tensorflow::PersistentTensor> dev_tensors_;
+};
+} // namespace tensorrt
+} // namespace tensorflow
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
diff --git a/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc
new file mode 100644
index 0000000000..4835e50650
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+namespace tensorflow {
+
+REGISTER_OP("TRTCalibOp")
+ .Attr("segment_nodes: list(string)") // names of the ops in segment
+ .Attr("segment_output_names: list(string)") // names of the output ops in
+ // segment
+ .Attr("input_names: list(string)") // names of the inputs for
+ // passing into tensorrt
+ .Attr("resource_name: string")
+ .Attr("InT: list({int8, float16, float32})")
+ .Input("in_tensor: InT")
+ .Output("out_tensor: InT")
+ .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+ for (int i = 0; i < c->num_inputs(); i++) {
+ c->set_output(i, c->input(i));
+ }
+ return Status::OK();
+ });
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
new file mode 100644
index 0000000000..3d5cc76c42
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+
+#include <atomic>
+#include <chrono>
+#include <unordered_map>
+
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// The batch size is fixed at construction, before the engine-build thread starts.
+int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
+
+TRTInt8Calibrator::TRTInt8Calibrator(
+ const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers,
+ int batch_size, string engine_name)
+ : batch_size_(batch_size),
+ done_(false),
+ dev_buffers_(dev_buffers),
+ calib_running_(false),
+ engine_name_(engine_name) {}
+
+bool TRTInt8Calibrator::setBatch(
+ const std::unordered_map<string, void*>& data) {
+ // TODO(aaroey): make sure that in future PR:
+ // 1. the mutex_lock is outside of the loop
+ // 2. wait() is used instead of wait_for()
+ // 3. done_ is to be protected by the mutex
+ // 4. the first batch is not missed
+ if (done_) return false;
+ while (calib_running_.load(
+ std::memory_order_acquire)) { // wait while calibration is running
+ tensorflow::mutex_lock l(cond_mtx_);
+ cond_.wait_for(l, std::chrono::milliseconds(50));
+ if (done_) return false;
+ }
+ VLOG(1) << "Set Batch Waiting finished";
+ for (const auto& it : data) {
+ auto devptr = dev_buffers_.find(it.first);
+ if (devptr == dev_buffers_.end()) {
+ LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
+ << "' does not match with the buffer names";
+ }
+ const auto& d = devptr->second;
+
+ // TODO(aaroey): we should not use sync copy on default stream. Make sure
+ // stream->ThenMemcpy() is used in future PRs.
+ auto status =
+ cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice);
+ if (status != cudaSuccess) {
+ LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first
+ << "' failed with " << status;
+ }
+ }
+ calib_running_.store(true, std::memory_order_release); // release builder
+ cond_.notify_all();
+ return true;
+}
+
+bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
+ int num_bindings) {
+ calib_running_.store(false, std::memory_order_release); // wait for new batch
+ cond_.notify_all();
+ while (!calib_running_.load(
+ std::memory_order_acquire)) { // wait until new batch arrives
+ tensorflow::mutex_lock l(cond_mtx_);
+ cond_.wait_for(l, std::chrono::milliseconds(50));
+ if (done_) return false;
+ }
+ if (done_) {
+ return false;
+ }
+
+ for (int i = 0; i < num_bindings; i++) {
+ auto it = dev_buffers_.find(names[i]);
+ if (it == dev_buffers_.end()) {
+ LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
+ << names[i] << "' at position " << i;
+ }
+
+ bindings[i] = it->second.first;
+ }
+ return true;
+}
+
+const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
+ // Calibration caching is not implemented; always calibrate from scratch.
+ return nullptr;
+}
+
+void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
+ std::size_t length) {}
+
+TRTInt8Calibrator::~TRTInt8Calibrator() {
+ VLOG(1) << "Destroying calibrator for " << engine_name_;
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
new file mode 100644
index 0000000000..8830f7efe7
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "tensorflow/core/platform/mutex.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+namespace tensorflow {
+namespace tensorrt {
+// This class provides a one-element queue to match TF's push model to
+// TRT's pull model for calibration. When TRT implements a means for
+// push calibration, this class should be updated accordingly.
+
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+ public:
+ TRTInt8Calibrator(
+ const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers,
+ int batch_size, string engine_name);
+ int getBatchSize() const override;
+ bool getBatch(void* bindings[], const char* names[],
+ int num_bindings) override;
+ bool setBatch(const std::unordered_map<string, void*>& data);
+ void setDone() { done_ = true; }
+ const void* readCalibrationCache(std::size_t& length) override;
+ void writeCalibrationCache(const void* ptr, std::size_t length) override;
+ ~TRTInt8Calibrator();
+
+ private:
+ const int batch_size_;
+ tensorflow::mutex cond_mtx_; // mutex for condition_variable
+ tensorflow::condition_variable cond_; // condition variable to implement
+ // producer-consumer queue for
+ // calibration
+ bool done_;
+ const std::unordered_map<string, std::pair<void*, size_t>>
+ dev_buffers_; // map to keep tensorrt input buffers and sizes keyed with
+ // buffer names
+ std::atomic_bool calib_running_;
+ string engine_name_;
+};
+} // namespace tensorrt
+} // namespace tensorflow
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
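A minimal Python sketch of the one-element queue described in the header comment above, adapting a push producer (`setBatch`) to a pull consumer (`getBatch`); names are illustrative:

```python
import threading

class OneElementQueue:
    def __init__(self):
        self._cv = threading.Condition()
        self._item = None
        self._done = False

    def set_batch(self, item):          # producer: the TF op pushes a batch
        with self._cv:
            while self._item is not None and not self._done:
                self._cv.wait()
            if self._done:
                return False
            self._item = item
            self._cv.notify_all()
            return True

    def get_batch(self):                # consumer: TRT pulls the next batch
        with self._cv:
            while self._item is None and not self._done:
                self._cv.wait()
            if self._done:
                return None
            item, self._item = self._item, None
            self._cv.notify_all()
            return item

    def set_done(self):                 # unblocks both sides, like setDone()
        with self._cv:
            self._done = True
            self._cv.notify_all()
```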
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
new file mode 100644
index 0000000000..e663eed4dd
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+std::shared_ptr<tensorflow::ResourceMgr>
+tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
+ // The mutex is held for the lookup only. The cases where it is held longer
+ // occur during op creation, which should be fine.
+ tensorflow::mutex_lock lock(map_mutex_);
+ auto s = managers_.find(op_name);
+ if (s == managers_.end()) {
+ auto it = managers_.emplace(
+ op_name, std::make_shared<tensorflow::ResourceMgr>(op_name));
+ VLOG(1) << "Returning a new manager " << op_name;
+ return it.first->second;
+ }
+ VLOG(1) << "Returning old manager " << op_name;
+ return s->second;
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
new file mode 100644
index 0000000000..5f8ad491d3
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#include <memory>
+
+#include <string>
+#include <unordered_map>
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class TRTResourceManager {
+ TRTResourceManager() = default;
+
+ public:
+ static std::shared_ptr<TRTResourceManager> instance() {
+ static std::shared_ptr<TRTResourceManager> instance_(
+ new TRTResourceManager);
+ return instance_;
+ }
+ // Returns a manager for the given op; if one doesn't exist, it is created.
+ std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
+
+ private:
+ std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
+ managers_;
+ tensorflow::mutex map_mutex_;
+};
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
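A Python sketch of the manager-per-op-name pattern above: a process-wide registry that lazily creates and caches one resource manager per key. The `factory` parameter is an illustrative stand-in for `ResourceMgr`:

```python
import threading

class ResourceManagerRegistry:
    _lock = threading.Lock()
    _managers = {}

    @classmethod
    def get_manager(cls, op_name, factory=dict):
        with cls._lock:                 # held for the lookup only
            if op_name not in cls._managers:
                cls._managers[op_name] = factory()
            return cls._managers[op_name]

mgr = ResourceManagerRegistry.get_manager("TRTCalibOps")
assert mgr is ResourceManagerRegistry.get_manager("TRTCalibOps")
```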
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
new file mode 100644
index 0000000000..3c85968ae7
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+class TRTCalibrationResource : public tensorflow::ResourceBase {
+ public:
+ TRTCalibrationResource()
+ : calibrator_(nullptr),
+ builder_(nullptr),
+ network_(nullptr),
+ engine_(nullptr),
+ logger_(nullptr),
+ thr_(nullptr) {}
+ string DebugString() override {
+ std::stringstream oss;
+ oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl
+ << " Builder = " << std::hex << builder_ << std::dec << std::endl
+ << " Network = " << std::hex << network_ << std::dec << std::endl
+ << " Engine = " << std::hex << engine_ << std::dec << std::endl
+ << " Logger = " << std::hex << logger_ << std::dec << std::endl
+ << " Thread = " << std::hex << thr_ << std::dec << std::endl;
+ return oss.str();
+ }
+ ~TRTCalibrationResource() {
+ VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+ }
+ TRTInt8Calibrator* calibrator_;
+ nvinfer1::IBuilder* builder_;
+ nvinfer1::INetworkDefinition* network_;
+ nvinfer1::ICudaEngine* engine_;
+ tensorflow::tensorrt::Logger* logger_;
+ // TODO(sami): Use threadpool threads!
+ std::thread* thr_;
+};
+
+class TRTWeightStore : public tensorflow::ResourceBase {
+ public:
+ TRTWeightStore() {}
+ std::list<std::vector<uint8_t>> store_;
+ string DebugString() override {
+ std::stringstream oss;
+ size_t lenBytes = 0;
+ for (const auto& v : store_) {
+ lenBytes += v.size() * sizeof(uint8_t);
+ }
+ oss << " Number of entries = " << store_.size() << std::endl
+ << " Total number of bytes = "
+ << store_.size() * sizeof(std::vector<uint8_t>) + lenBytes << std::endl;
+ return oss.str();
+ }
+ virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
+};
+
+class TRTEngineResource : public tensorflow::ResourceBase {
+ public:
+ TRTEngineResource() : runtime_(nullptr), ctx_(nullptr) {}
+ string DebugString() override { return string(""); }
+ nvinfer1::IRuntime* runtime_;
+ nvinfer1::IExecutionContext* ctx_;
+};
+
+} // namespace tensorrt
+} // namespace tensorflow
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index fff972c1f3..ed3ed4c0e1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -140,11 +140,13 @@ py_library(
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:state_ops",
+ "//tensorflow/python:summary",
"//tensorflow/python:util",
"//tensorflow/python:variable_scope",
"//tensorflow/python/estimator:estimator_py",
"//tensorflow/python/estimator:export",
"//tensorflow/python/estimator:head",
+ "//tensorflow/python/estimator:metric_keys",
],
)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 8731b10923..f4d9351432 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.timeseries.python.timeseries import feature_keys
from tensorflow.python.estimator import estimator_lib
from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import metric_keys
from tensorflow.python.estimator.export import export_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
@@ -35,6 +36,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest
+from tensorflow.python.summary import summary
def time_series_regression_head(model,
@@ -71,14 +73,34 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
self.input_statistics_generator = input_statistics_generator
self._name = name
+ @property
+ def name(self):
+ return self._name
+
+ # TODO(terrytangyuan): consolidate `model_outputs` and `_Head.LossSpec`
+ # once `_Head.create_loss` becomes extendable
+ def create_loss(self, features, mode, logits=None, labels=None):
+ """See `_Head`."""
+ model_outputs = self.state_manager.define_loss(
+ self.model, features, mode)
+ summary.scalar(
+ head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),
+ model_outputs.loss)
+ return model_outputs
+
+ @property
+ def logits_dimension(self):
+ """See `_Head`."""
+ return 1
+
def _train_ops(self, features):
"""Add training ops to the graph."""
+ mode = estimator_lib.ModeKeys.TRAIN
with variable_scope.variable_scope(
"model",
# Use ResourceVariables to avoid race conditions.
use_resource=True):
- model_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.TRAIN)
+ model_outputs = self.create_loss(features, mode)
train_op = optimizers.optimize_loss(
model_outputs.loss,
@@ -88,31 +110,14 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
learning_rate=None)
return estimator_lib.EstimatorSpec(
loss=model_outputs.loss,
- mode=estimator_lib.ModeKeys.TRAIN,
+ mode=mode,
train_op=train_op)
- # TODO(terrytangyuan): suffix summary and metrics keys by `"/" + name`
- @property
- def name(self):
- return self._name
-
- # TODO(terrytangyuan): unused for now. Need to decouple
- # `state_manager.define_loss` to satisfy the extendable return signature of
- # `_Head.create_loss`.
- def create_loss(self, features, mode, logits, labels):
- """See `_Head`."""
- return None
-
- # TODO(terrytangyuan): check label dimension
- @property
- def logits_dimension(self):
- return None
-
def _evaluate_ops(self, features):
"""Add ops for evaluation (aka filtering) to the graph."""
+ mode = estimator_lib.ModeKeys.EVAL
with variable_scope.variable_scope("model", use_resource=True):
- model_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.EVAL)
+ model_outputs = self.create_loss(features, mode)
metrics = {}
# Just output in-sample predictions for the last chunk seen
for prediction_key, prediction_value in model_outputs.predictions.items():
@@ -125,7 +130,7 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
model_outputs.end_state))
return estimator_lib.EstimatorSpec(
loss=model_outputs.loss,
- mode=estimator_lib.ModeKeys.EVAL,
+ mode=mode,
eval_metric_ops=metrics,
predictions={})
@@ -143,9 +148,8 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
with variable_scope.variable_scope("model", use_resource=True):
prediction_outputs = self.model.predict(features=features)
with variable_scope.variable_scope("model", reuse=True):
- filtering_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.EVAL)
-
+ filtering_outputs = self.create_loss(
+ features, estimator_lib.ModeKeys.EVAL)
return estimator_lib.EstimatorSpec(
mode=estimator_lib.ModeKeys.PREDICT,
export_outputs={
@@ -194,7 +198,7 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
def create_estimator_spec(self, features, mode, labels=None):
"""Performs basic error checking and returns an EstimatorSpec."""
- with ops.name_scope("head"):
+ with ops.name_scope(self._name, "head"):
if labels:
raise ValueError(
"The model received a `labels` dictionary, which is "
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 58fed4e5cb..4b6104a8b4 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -93,7 +93,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
1. When the sender receives a tensor request, the source tensor may or may not be ready yet. The situation is handled through a process of tag matching:
* If the request arrives before the tensor is ready, then a callback is put in a local table, and will be invoked once the tensor arrives.
- * If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
+ * If the tensor is ready before the request arrives, then the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
In code it is done by calling **RecvLocalAsync()**, which receives the tensor's key, step-id, and the callback.
2. When the callback is invoked, the relevant tensor is removed from the tag matching table. In the case where we need to send the tensor's meta-data, the **RdmaTensorResponse** will store a copy of the tensor until the re-request arrives.
3. The sending of protocol messages (**RDMA_MESSAGE_TENSOR_REQUEST**, **RDMA_MESSAGE_META_DATA_RESPONSE** and **RDMA_MESSAGE_TENSOR_RE_REQUEST**) is done by the class **RdmaMessageBuffer**. All messages are sent using RDMA writes from/to fixed messages buffers. This implies that we cannot send on a specific channel more than one message at a time. In order to synchronize the messages, the **RdmaMessageBuffer** holds the a local and remote buffer statuses which can be either busy or idle. When a write is issued, both statuses will be changed to busy. When the write-complete event is received, the local status is changed to idle. When the write is received on the remote side, the remote side will parse the message, and return an ACK back to the sending side on which the sending side will update the remote status to idle. When both the local and remote statuses are idle, the next message can be sent.
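A minimal Python sketch of the tag-matching step described in point 1 above: whichever of the request or the tensor arrives first is parked in a table, and the latecomer fires the callback immediately.

```python
class TagMatcher:
    def __init__(self):
        self._pending_callbacks = {}   # key -> callback waiting for a tensor
        self._ready_tensors = {}       # key -> tensor waiting for a request

    def on_request(self, key, callback):
        if key in self._ready_tensors:
            callback(self._ready_tensors.pop(key))    # tensor arrived first
        else:
            self._pending_callbacks[key] = callback   # wait for the tensor

    def on_tensor_ready(self, key, tensor):
        if key in self._pending_callbacks:
            self._pending_callbacks.pop(key)(tensor)  # request arrived first
        else:
            self._ready_tensors[key] = tensor         # wait for the request
```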
diff --git a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
index 956b8f2147..da6fdd48e1 100644
--- a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
+++ b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
@@ -64,7 +64,7 @@ The protocol messages themselves will remain mostly unchanged at the first stage
* type - The message type.
* request_index - Request index.
* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
-* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-request after meta-data update and reallocation of result/proxy tensors.
* type - The message type.
* name (name_size) - Name of the requested tensor.
* step_id - Step ID.
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 7d95b6522c..86350a08e5 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000..e21f56ba5b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,85 @@
+op {
+ graph_op_name: "UniqueWithCountsV2"
+ in_arg {
+ name: "x"
+ description: <<END
+A `Tensor`.
+END
+ }
+ in_arg {
+ name: "axis"
+ description: <<END
+A `Tensor` of type `int32` (default: None). The axis of the Tensor along
+which to find the unique elements.
+END
+ }
+ out_arg {
+ name: "y"
+ description: <<END
+A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+END
+ }
+ out_arg {
+ name: "idx"
+ description: <<END
+A 1-D Tensor. It has the same type as x and contains the index of each
+value of x in the output y.
+END
+ }
+ out_arg {
+ name: "count"
+ description: <<END
+A 1-D Tensor. The count of each value of x in the output y.
+END
+ }
+ summary: "Finds unique elements along an axis of a tensor."
+ description: <<END
+This operation returns a tensor `y` containing the unique elements
+along the `axis` of a tensor. The returned unique elements are sorted
+in the same order as they occur along `axis` in `x`.
+This operation also returns a tensor `idx` and a tensor `count`
+that are the same size as the number of elements in `x` along the
+`axis` dimension. The `idx` contains the index in the unique output `y`
+and the `count` contains the count in the unique output `y`.
+In other words, for a `1-D` tensor `x` with `axis = None`:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,len(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx, count = unique_with_counts(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+count ==> [2, 1, 3, 1, 2]
+```
+
+For a `2-D` tensor `x` with `axis = 0`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+# [1, 0, 0],
+# [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=0)
+y ==> [[1, 0, 0],
+ [2, 0, 0]]
+idx ==> [0, 0, 1]
+count ==> [2, 1]
+```
+
+For a `2-D` tensor `x` with `axis = 1`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+# [1, 0, 0],
+# [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=1)
+y ==> [[1, 0],
+ [1, 0],
+ [2, 0]]
+idx ==> [0, 1, 1]
+count ==> [1, 2]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 4e69e0bc63..4ca6780c95 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -14,20 +14,21 @@ Has same shape as data, except for dimension 0 which
has size `num_segments`.
END
}
- summary: "Computes the Max along segments of a tensor."
+ summary: "Computes the maximum along segments of a tensor."
description: <<END
Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
segments.
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the maximum such that:
\\(output_i = \max_j data_j\\) where max is over `j` such
that `segment_ids[j] == i`.
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
+If the maximum is empty for a given segment ID `i`, it outputs the smallest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::lowest()`.
<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000..55ea69b5dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,33 @@
+op {
+ graph_op_name: "UnsortedSegmentMin"
+ in_arg {
+ name: "segment_ids"
+ description: <<END
+A 1-D tensor whose size is equal to the size of `data`'s
+first dimension.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+ }
+ summary: "Computes the minimum along segments of a tensor."
+ description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the minimum such that:
+
+\\(output_i = \min_j data_j\\) where min is over `j` such
+that `segment_ids[j] == i`.
+
+If the minimum is empty for a given segment ID `i`, it outputs the largest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::max()`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000..577ff53d60
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,32 @@
+op {
+ graph_op_name: "UnsortedSegmentProd"
+ in_arg {
+ name: "segment_ids"
+ description: <<END
+A 1-D tensor whose size is equal to the size of `data`'s
+first dimension.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+ }
+ summary: "Computes the product along segments of a tensor."
+ description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the product of all
+entries belonging to a segment such that:
+
+\\(output_i = \prod_j data_j\\) where the product is over `j` such
+that `segment_ids[j] == i`.
+
+If there is no entry for a given segment ID `i`, it outputs 1.
+END
+}
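A numpy sketch of the segment reductions documented in these three files, including the identity used for empty segments (`lowest()`, `max()`, and `1` for max, min, and prod respectively); `unsorted_segment_reduce` is an illustrative helper, not a TensorFlow API:

```python
import numpy as np

def unsorted_segment_reduce(data, segment_ids, num_segments, op, identity):
    """Reduces data[j] into out[segment_ids[j]] with op, seeded by identity."""
    out = np.full(num_segments, identity, dtype=data.dtype)
    for value, sid in zip(data, segment_ids):
        out[sid] = op(out[sid], value)
    return out

data = np.array([1., 2., 3., 4.])
ids = [0, 0, 2, 2]  # segment 1 is empty
print(unsorted_segment_reduce(data, ids, 3, min, np.finfo(data.dtype).max))
# -> [1.0, 1.797...e+308, 3.0]; the empty segment gets numeric_limits::max()
print(unsorted_segment_reduce(data, ids, 3, lambda a, b: a * b, 1.0))
# -> [2.0, 1.0, 12.0]; the empty segment gets 1
```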
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000..71b35eaab5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "UniqueWithCounts"
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000..7876e55cf3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "UniqueWithCountsV2"
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index c88daa8ff8..d817c7dd1f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -68,7 +68,7 @@ class BaseGPUDevice : public LocalDevice {
const TensorReferenceVector& tensor_refs) override;
Status FillContextMap(const Graph* graph,
- DeviceContextMap* device_context_map);
+ DeviceContextMap* device_context_map) override;
void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 90664c3612..51b9547f53 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,8 +43,8 @@ SessionMgr::SessionMgr(
worker_cache_factory_(std::move(worker_cache_factory)) {}
string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
- return strings::StrCat("/job:", server_def.job_name(),
- "/replica:0/task:", server_def.task_index());
+ return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
+ server_def.task_index());
}
Status SessionMgr::CreateSession(const string& session,
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 99a5d0a054..4c38fbbe59 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
#include <complex>
-
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
// Disable clang-format to prevent 'FixedPoint' header from being included
// before 'Tensor' header on which it depends.
@@ -43,12 +42,47 @@ typedef Eigen::QUInt16 quint16;
} // namespace tensorflow
+
+static inline tensorflow::bfloat16 FloatToBFloat16(float float_val) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ return *reinterpret_cast<tensorflow::bfloat16*>(
+ reinterpret_cast<uint16_t*>(&float_val));
+#else
+ return *reinterpret_cast<tensorflow::bfloat16*>(
+ &(reinterpret_cast<uint16_t*>(&float_val)[1]));
+#endif
+}
+
namespace Eigen {
-// TOOD(xpan): We probably need to overwrite more methods to have correct eigen
-// behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
+// TODO(xpan): We probably need to overwrite more methods to have correct eigen
+// behavior. E.g. epsilon(), dummy_precision, etc. See NumTraits.h in eigen.
template <>
struct NumTraits<tensorflow::bfloat16>
- : GenericNumTraits<tensorflow::bfloat16> {};
+ : GenericNumTraits<tensorflow::bfloat16> {
+ enum {
+ IsInteger = 0,
+ IsSigned = 1,
+ RequireInitialization = 0
+ };
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 highest() {
+ return FloatToBFloat16(NumTraits<float>::highest());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 lowest() {
+ return FloatToBFloat16(NumTraits<float>::lowest());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 infinity() {
+ return FloatToBFloat16(NumTraits<float>::infinity());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 quiet_NaN() {
+ return FloatToBFloat16(NumTraits<float>::quiet_NaN());
+ }
+};
+
using ::tensorflow::operator==;
using ::tensorflow::operator!=;
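A self-contained restatement (a sketch under the same endianness assumptions, not the patch's code) of the bit-level trick FloatToBFloat16 relies on: bfloat16 is the high 16 bits of the IEEE-754 float32 pattern, which on a little-endian host is the second uint16_t of the float:

    #include <cstdint>
    #include <cstring>

    // memcpy sidesteps the strict-aliasing concerns of the reinterpret_cast
    // used above; the result keeps the sign bit, the 8 exponent bits, and
    // the top 7 mantissa bits.
    uint16_t FloatToBFloat16Bits(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      return static_cast<uint16_t>(bits >> 16);
    }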
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index e94100e994..c9e8dd2217 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -310,8 +310,8 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
return errors::Internal(
"No unary variant binary_op function found for binary variant op "
"enum: ",
- op, " Variant type_name: '", a.TypeName(),
- "' for device type: ", device);
+ op, " Variant type_name: '", a.TypeName(), "' for device type: ",
+ device);
}
return (*binary_op_fn)(ctx, a, b, out);
}
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 6ded261c7d..fe095a725a 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -532,6 +532,7 @@ cc_library(
"//tensorflow/core/grappler:op_types",
"//tensorflow/core/grappler:utils",
"//tensorflow/core/grappler/costs:graph_properties",
+ "//tensorflow/core/grappler/utils:frame",
],
)
@@ -539,6 +540,11 @@ tf_cc_test(
name = "loop_optimizer_test",
size = "small",
srcs = ["loop_optimizer_test.cc"],
+ tags = [
+ "manual",
+ "no_oss", # b/74111495
+ "notap",
+ ],
deps = [
":loop_optimizer",
"//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 131466430e..244653504d 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -15,19 +15,31 @@ limitations under the License.
#include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
+#include <algorithm>
+#include <limits>
#include <unordered_map>
#include <unordered_set>
+#include <vector>
+#include <deque>
#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
+
+using tensorflow::strings::StrCat;
namespace tensorflow {
namespace grappler {
@@ -94,10 +106,375 @@ Status RemoveStackOps(const GraphDef& graph, GraphDef* optimized_graph) {
} // namespace
+Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
+ const int num_outputs) {
+ auto consumers = node_map_->GetOutputs(node->name());
+ std::vector<string> enter_control_inputs;
+ string enter_input;
+ for (auto& input : node->input()) {
+ if (IsControlInput(input)) {
+ enter_control_inputs.push_back(input);
+ } else {
+ enter_input = input;
+ }
+ }
+ for (auto* consumer : consumers) {
+ if (invariant_nodes_.count(consumer)) {
+ for (int i = 0; i < consumer->input_size(); ++i) {
+ if (NodeName(consumer->input(i)) == node->name()) {
+ consumer->set_input(i, enter_input);
+ node_map_->AddOutput(NodeName(enter_input), consumer->name());
+ node_map_->RemoveOutput(node->name(), consumer->name());
+ }
+ }
+ for (auto& control_input : enter_control_inputs) {
+ consumer->add_input(control_input);
+ node_map_->AddOutput(NodeName(control_input), consumer->name());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status LoopOptimizer::LINMHandleConst(NodeDef* node,
+ const int num_outputs, const int frame_id) {
+ NodeDef* const_node;
+ if (num_outputs == 0) {
+    // All successor nodes are invariant, so the const node itself can be
+    // moved out of the frame (into the parent frame); remove its control
+    // inputs from this frame when doing so.
+ const_node = node;
+ node_map_->RemoveInputs(node->name());
+ node->clear_input();
+ } else {
+    // Some successor nodes are variant, so the const node has to stay in
+    // the frame; create a copy outside the frame (in the parent frame)
+    // for the invariant consumers.
+ const_node = optimized_graph_->add_node();
+ const_node->set_name(AddPrefixToNodeName(node->name(), kLoopOptimizer));
+ const_node->set_op("Const");
+ const_node->set_device(node->device());
+ *const_node->mutable_attr() = node->attr();
+ node_map_->AddNode(const_node->name(), const_node);
+ auto consumers = node_map_->GetOutputs(node->name());
+ for (auto* consumer : consumers) {
+ if (invariant_nodes_.count(consumer)) {
+ for (int i = 0; i < consumer->input_size(); ++i) {
+ if (NodeName(consumer->input(i)) == node->name()) {
+ if (IsControlInput(consumer->input(i))) {
+ *consumer->mutable_input(i) = AsControlDependency(*const_node);
+ } else {
+ *consumer->mutable_input(i) = const_node->name();
+ }
+ node_map_->AddOutput(const_node->name(), consumer->name());
+ node_map_->RemoveOutput(node->name(), consumer->name());
+ }
+ }
+ }
+ }
+ }
+  // Add a control input from the parent frame so the moved const node
+  // executes in the parent frame.
+ auto parent_it = frame_parent_.find(frame_id);
+ if (parent_it != frame_parent_.end()) {
+ int parent_id = parent_it->second;
+ auto loop_cond_it = loop_cond_.find(parent_id);
+ if (loop_cond_it == loop_cond_.end()) {
+      return errors::InvalidArgument(
+          "Parent frame ", parent_id, " doesn't have a LoopCond node");
+ }
+ auto& loop_cond_name = loop_cond_it->second->name();
+ NodeDef* switch_node = nullptr;
+    for (auto* consumer : node_map_->GetOutputs(loop_cond_name)) {
+      if (consumer->op() == "Switch") {
+        switch_node = consumer;
+ break;
+ }
+ }
+ if (!switch_node) {
+ return errors::InvalidArgument(
+ "LoopCond node of Frame ", frame_id,
+ " doesn't connect to any Switch node");
+ }
+ string switch_output = StrCat(switch_node->name(), ":1");
+ const string ctrl_dep = ConstantFolding::AddControlDependency(
+ switch_output, optimized_graph_, node_map_.get());
+ const_node->add_input(ctrl_dep);
+ node_map_->AddOutput(NodeName(ctrl_dep), const_node->name());
+ }
+ return Status::OK();
+}
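For reference, a hedged sketch of the convention LINMHandleConst leans on when it anchors the moved const node: in a GraphDef, a control input is simply the producer's name prefixed with '^', which is what AddControlDependency constructs from the parent Switch's ":1" output:

    #include <string>

    // "^name" means: wait for `name` to execute, but consume no tensor
    // from it.
    std::string AsControlInputSketch(const std::string& producer_name) {
      return "^" + producer_name;
    }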
+
+Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
+ const int num_outputs, const int frame_id) {
+  // Control inputs from the same frame have to be removed from the
+  // invariant node when moving it out of this frame.
+ for (int i = 0; i < node->input_size(); ++i) {
+ if (IsControlInput(node->input(i))) {
+ node->mutable_input()->SwapElements(i, node->input_size() - 1);
+ node->mutable_input()->RemoveLast();
+ }
+ }
+ if (num_outputs == 0) {
+ return Status::OK();
+ }
+
+ DataTypeVector input_types;
+ DataTypeVector output_types;
+ OpRegistryInterface* op_registry = OpRegistry::Global();
+ const OpRegistrationData* op_reg_data = nullptr;
+ TF_RETURN_IF_ERROR(
+ op_registry->LookUp(node->op(), &op_reg_data));
+ TF_RETURN_IF_ERROR(
+ InOutTypesForNode(*node, op_reg_data->op_def,
+ &input_types, &output_types));
+
+ auto consumers = node_map_->GetOutputs(node->name());
+ string fname = invariant_enters_[frame_id][0]->attr().at("frame_name").s();
+ int piterations = invariant_enters_[frame_id][0]
+ ->attr().at("parallel_iterations").i();
+ for (auto* consumer : consumers) {
+ if (!invariant_nodes_.count(consumer)) {
+ for (int i = 0; i < consumer->input_size(); ++i) {
+ int port;
+ string node_name = ParseNodeName(consumer->input(i), &port);
+ if (node_name != node->name()) {
+ continue;
+ }
+ if (port < 0) {
+ return errors::InvalidArgument(
+ "Invariant node should not have control outputs "
+ "to variant node");
+ }
+ DataType output_type = output_types[port];
+ NodeDef* new_enter = optimized_graph_->add_node();
+ new_enter->set_op("Enter");
+ new_enter->set_device(node->device());
+ new_enter->set_name(AddPrefixToNodeName(
+ StrCat(fname, "_enter_", new_enter_id_++), kLoopOptimizer));
+ AttrValue data_type;
+ data_type.set_type(output_type);
+ new_enter->mutable_attr()->insert({"T", data_type});
+ AttrValue frame_name;
+ frame_name.set_s(fname);
+ new_enter->mutable_attr()->insert({"frame_name", frame_name});
+ AttrValue is_const;
+ is_const.set_b(true);
+ new_enter->mutable_attr()->insert({"is_constant", is_const});
+ AttrValue parallel_iterations;
+ parallel_iterations.set_i(piterations);
+ new_enter->mutable_attr()->insert(
+ {"parallel_iterations", parallel_iterations});
+ new_enter->add_input(consumer->input(i));
+ *consumer->mutable_input(i) = new_enter->name();
+ node_map_->AddNode(new_enter->name(), new_enter);
+ node_map_->AddOutput(node->name(), new_enter->name());
+ node_map_->AddOutput(new_enter->name(), consumer->name());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status LoopOptimizer::MoveInvariantNodes(const int frame_id) {
+ for (auto iter = invariant_nodes_.begin();
+ iter != invariant_nodes_.end(); ++iter) {
+ auto* invariant_node = iter->first;
+ const int num_outputs = iter->second;
+ if (IsEnter(*invariant_node)) {
+ TF_RETURN_IF_ERROR(
+ LINMHandleInvariantEnter(invariant_node, num_outputs));
+ } else if (IsConstant(*invariant_node)) {
+ TF_RETURN_IF_ERROR(
+ LINMHandleConst(invariant_node, num_outputs, frame_id));
+ } else {
+ TF_RETURN_IF_ERROR(
+ LINMHandleInvariantNode(invariant_node, num_outputs, frame_id));
+ }
+ }
+ return Status::OK();
+}
+
+Status LoopOptimizer::RevertInvariantNodes() {
+ std::deque<const NodeDef*> reverted_nodes;
+  for (auto iter = invariant_nodes_.begin();
+       iter != invariant_nodes_.end();) {
+ bool erased = false;
+ const auto* node = iter->first;
+ if (!IsConstant(*node) && !IsEnter(*node) && iter->second > 0) {
+ auto& consumers = node_map_->GetOutputs(node->name());
+ for (auto* consumer : consumers) {
+ if (!invariant_nodes_.count(consumer)) {
+ for (const auto& input : consumer->input()) {
+ if (IsControlInput(input) && NodeName(input) == node->name()) {
+ reverted_nodes.push_back(node);
+ invariant_nodes_.erase(iter++);
+ erased = true;
+ break;
+ }
+ }
+ if (erased) break;
+ }
+ }
+ }
+ if (!erased) ++iter;
+ }
+ while (!reverted_nodes.empty()) {
+ const auto* node = reverted_nodes.front();
+ reverted_nodes.pop_front();
+ std::set<NodeDef*> producers;
+ for (const auto& input : node->input()) {
+ auto* producer = node_map_->GetNode(input);
+ auto iter = invariant_nodes_.find(producer);
+ if (iter != invariant_nodes_.end()) {
+ if (IsControlInput(input) &&
+ !IsConstant(*producer) && !IsEnter(*producer)) {
+ reverted_nodes.push_back(producer);
+ invariant_nodes_.erase(iter);
+ } else {
+ producers.insert(producer);
+ }
+ }
+ }
+ for (auto* producer : producers) {
+ auto iter = invariant_nodes_.find(producer);
+ if (iter != invariant_nodes_.end()) {
+ ++iter->second;
+ }
+ }
+ for (auto* consumer : node_map_->GetOutputs(node->name())) {
+ auto iter = invariant_nodes_.find(consumer);
+ if (iter != invariant_nodes_.end()) {
+ reverted_nodes.push_back(consumer);
+ invariant_nodes_.erase(iter);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
+ auto consumers = node_map_->GetOutputs(node->name());
+ invariant_nodes_.insert(std::make_pair(node, consumers.size()));
+ for (auto* consumer : consumers) {
+ if (invariant_nodes_.count(consumer) ||
+ ModifiesFrameInfo(*consumer)) {
+ continue;
+ }
+ bool is_invariant = true;
+ for (const auto& input : consumer->input()) {
+ if (!IsControlInput(input)) {
+ const auto& name = NodeName(input);
+ auto* producer = node_map_->GetNode(name);
+ if (!invariant_nodes_.count(producer)) {
+ if (IsConstant(*producer)) {
+ invariant_nodes_.insert(
+ std::make_pair(producer, node_map_->GetOutputs(name).size()));
+ } else {
+ is_invariant = false;
+ break;
+ }
+ }
+ }
+ }
+ if (is_invariant) {
+ std::set<NodeDef*> producers;
+ for (const auto& input : consumer->input()) {
+ auto* producer = node_map_->GetNode(input);
+ producers.insert(producer);
+ }
+ for (auto* producer : producers) {
+ auto iter = invariant_nodes_.find(producer);
+ if (iter != invariant_nodes_.end()) {
+ --iter->second;
+ }
+ }
+ TF_RETURN_IF_ERROR(FindInvariantNodes(consumer));
+ }
+ }
+ return Status::OK();
+}
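A compact restatement (hypothetical helper, not the patch's code) of the rule FindInvariantNodes applies: a consumer is loop-invariant iff every non-control input is itself invariant, with constants admitted on the fly:

    #include <functional>
    #include <string>
    #include <vector>

    bool AllDataInputsInvariant(
        const std::vector<std::string>& inputs,
        const std::function<bool(const std::string&)>& is_invariant) {
      for (const std::string& input : inputs) {
        if (!input.empty() && input[0] == '^') continue;  // control input
        if (!is_invariant(input)) return false;
      }
      return true;
    }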
+
+Status LoopOptimizer::LoopInvariantNodeMotion() {
+ std::deque<int> worklist;
+ for (auto iter = frame_map_.begin(); iter != frame_map_.end(); ++iter) {
+ auto* node = iter->first;
+ auto& frame_ids = iter->second;
+ if (frame_ids.size() >= 3) {
+ for (unsigned int i = 1; i < frame_ids.size() - 1; ++i) {
+ frame_parent_[frame_ids[i]] = frame_ids[i - 1];
+ frame_children_[frame_ids[i]].insert(frame_ids[i + 1]);
+ }
+ }
+ if (frame_ids.size() >= 2) {
+ frame_children_[frame_ids[0]].insert(frame_ids[1]);
+ frame_parent_[frame_ids.back()] = frame_ids[frame_ids.size() - 2];
+ }
+ if (!frame_ids.empty()) {
+ frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
+ if (node->op() == "LoopCond") {
+ if (loop_cond_.count(frame_ids.back())) {
+ return errors::InvalidArgument(
+ "Loop ", frame_ids.back(),
+ " has more than one LoopCond node: ", node->name(), " and ",
+ loop_cond_[frame_ids.back()]->name());
+ }
+ loop_cond_[frame_ids.back()] = node;
+ }
+ if (IsEnter(*node) && node->attr().at("is_constant").b()) {
+ invariant_enters_[frame_ids.back()].push_back(
+ const_cast<NodeDef*>(node));
+ }
+ }
+ }
+
+ for (auto it = frame_children_.begin(); it != frame_children_.end(); ++it) {
+ if (it->second.empty()) {
+ worklist.push_back(it->first);
+ }
+ }
+
+ while (!worklist.empty()) {
+ int frame_id = worklist.front();
+ new_enter_id_ = 0;
+ worklist.pop_front();
+ auto parent_it = frame_parent_.find(frame_id);
+ if (parent_it != frame_parent_.end()) {
+ int parent_id = parent_it->second;
+ frame_children_[parent_id].erase(frame_id);
+ if (frame_children_[parent_id].empty()) {
+ worklist.push_back(parent_id);
+ }
+ }
+
+ if (invariant_enters_[frame_id].empty()) {
+ continue;
+ }
+ invariant_nodes_.clear();
+ for (auto* enter : invariant_enters_[frame_id]) {
+ TF_RETURN_IF_ERROR(FindInvariantNodes(enter));
+ }
+
+    // Revert invariant nodes that have control outputs to variant nodes.
+ TF_RETURN_IF_ERROR(RevertInvariantNodes());
+
+ TF_RETURN_IF_ERROR(MoveInvariantNodes(frame_id));
+ }
+ return Status::OK();
+}
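A minimal sketch (illustrative only) of the traversal order used above: frames are optimized innermost-first by popping frames with no unprocessed children and unblocking their parents as they finish:

    #include <deque>
    #include <map>
    #include <set>

    void ProcessInnermostFirst(std::map<int, std::set<int>> children,
                               const std::map<int, int>& parent) {
      std::deque<int> worklist;
      for (const auto& entry : children)
        if (entry.second.empty()) worklist.push_back(entry.first);
      while (!worklist.empty()) {
        const int frame = worklist.front();
        worklist.pop_front();
        // ... run the per-frame optimization here ...
        auto it = parent.find(frame);
        if (it == parent.end()) continue;
        children[it->second].erase(frame);
        if (children[it->second].empty()) worklist.push_back(it->second);
      }
    }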
+
Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
GraphDef* optimized_graph) {
- Status status = RemoveStackOps(item.graph, optimized_graph);
- return status;
+ TF_RETURN_IF_ERROR(RemoveStackOps(item.graph, optimized_graph));
+
+ optimized_graph_ = optimized_graph;
+
+ // Set up helper data structures.
+ node_map_.reset(new NodeMap(optimized_graph_));
+ int num_frames;
+ TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+ &frame_map_, &num_frames));
+
+ TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
+ return Status::OK();
}
void LoopOptimizer::Feedback(Cluster* /*cluster*/, const GrapplerItem& /*item*/,
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 106d4628ae..c1b0321e4e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -17,13 +17,17 @@ limitations under the License.
#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_
#include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
#include "tensorflow/core/protobuf/rewriter_config.pb.h"
namespace tensorflow {
namespace grappler {
+constexpr char kLoopOptimizer[] = "LoopOptimizer";
+
class LoopOptimizer : public GraphOptimizer {
public:
LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
@@ -40,7 +44,29 @@ class LoopOptimizer : public GraphOptimizer {
const GraphDef& optimized_graph, double result) override;
private:
+ Status LoopInvariantNodeMotion();
+ Status FindInvariantNodes(NodeDef* node);
+ Status RevertInvariantNodes();
+ Status MoveInvariantNodes(const int frame_id);
+ Status LINMHandleInvariantNode(NodeDef* node, const int num_outputs,
+ const int frame_id);
+ Status LINMHandleConst(NodeDef* node, const int num_outputs,
+ const int frame_id);
+ Status LINMHandleInvariantEnter(NodeDef* node, const int num_outputs);
+
+ std::map<NodeDef*, int> invariant_nodes_;
+ std::set<int> empty_set_;
+ std::map<int, std::set<int>> frame_children_;
+ std::map<int, int> frame_parent_;
+ std::map<int, const NodeDef*> loop_cond_;
+ std::map<int, std::vector<NodeDef*>> invariant_enters_;
+ int new_enter_id_;
RewriterConfig::Toggle opt_level_;
+
+ std::unique_ptr<NodeMap> node_map_;
+ FrameMap frame_map_;
+ std::unique_ptr<GraphProperties> graph_properties_;
+ GraphDef* optimized_graph_; // Not owned.
};
} // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 3d54aa7a79..0bd202a2ab 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -26,7 +26,494 @@ namespace tensorflow {
namespace grappler {
namespace {
-class LoopOptimizerTest : public ::testing::Test {};
+class LoopOptimizerTest : public ::testing::Test {
+ protected:
+ static NodeDef CreateNode(const string& name,
+ const std::vector<string>& inputs) {
+ return CreateNode(name, "Identity", "", false, 0, inputs);
+ }
+ static NodeDef CreateNode(const string& name, const string& op,
+ const std::vector<string>& inputs) {
+ return CreateNode(name, op, "", false, 0, inputs);
+ }
+ static NodeDef CreateNode(const string& name, const string& op,
+ const string& frame,
+ const bool is_constant,
+ const int piterations,
+ const std::vector<string>& inputs) {
+ NodeDef node;
+ node.set_name(name);
+ if (!op.empty()) {
+ node.set_op(op);
+ }
+ if (!frame.empty()) {
+ AttrValue frame_name;
+ frame_name.set_s(frame);
+ node.mutable_attr()->insert({"frame_name", frame_name});
+ }
+ if (op == "Enter") {
+ AttrValue is_const;
+ is_const.set_b(is_constant);
+ node.mutable_attr()->insert({"is_constant", is_const});
+ AttrValue parallel_iterations;
+ parallel_iterations.set_i(piterations);
+ node.mutable_attr()->insert(
+ {"parallel_iterations", parallel_iterations});
+ }
+ AttrValue type;
+ type.set_type(DT_FLOAT);
+ node.mutable_attr()->insert({"T", type});
+ for (const string& input : inputs) {
+ node.add_input(input);
+ }
+ return node;
+ }
+};
+
+TEST_F(LoopOptimizerTest, Basic) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"VariantAdd", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"VariantAdd"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, Const) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode("Const", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "Const"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"VariantAdd", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"VariantAdd"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const")).back(), 0);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 0);
+}
+
+TEST_F(LoopOptimizerTest, ControlOutput) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode(
+ "Less", "Less", {"VariantAdd", "less/y", "^InvariantAdd"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"VariantAdd"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoop1) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"Exit2"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ *graph.add_node() = CreateNode(
+ "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
+ {"VariantAdd"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter2", "Enter", "while/while/while_context", false, 1,
+ {"VariantEnter"});
+ *graph.add_node() = CreateNode(
+ "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
+ *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
+ *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
+ *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
+ *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration2", "NextIteration", {"VariantAdd2"});
+ *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoop2) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"Exit2"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ *graph.add_node() = CreateNode(
+ "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
+ {"InvariantAdd"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter2", "Enter", "while/while/while_context", false, 1,
+ {"VariantEnter"});
+ *graph.add_node() = CreateNode(
+ "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
+ *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
+ *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
+ *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
+ *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration2", "NextIteration", {"VariantAdd2"});
+ *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoopConst1) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"Exit2"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ *graph.add_node() = CreateNode(
+ "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
+ {"VariantAdd"});
+ *graph.add_node() = CreateNode("Const2", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd2", "Add", {"InvariantEnter2", "Const2"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter2", "Enter", "while/while/while_context", false, 1,
+ {"VariantEnter"});
+ *graph.add_node() = CreateNode(
+ "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
+ *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
+ *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
+ *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
+ *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration2", "NextIteration", {"VariantAdd2"});
+ *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoopConst2) {
+ GraphDef graph;
+ *graph.add_node() = CreateNode("0", {});
+ *graph.add_node() = CreateNode(
+ "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd", "Add", {"InvariantAdd", "Identity"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
+ *graph.add_node() = CreateNode(
+ "Merge", "Merge", {"VariantEnter", "NextIteration"});
+ *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
+ *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
+ *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
+ *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
+ *graph.add_node() = CreateNode("Identity", {"Switch:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration", "NextIteration", {"Exit2"});
+ *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
+ *graph.add_node() = CreateNode("1", {"Exit"});
+
+ *graph.add_node() = CreateNode(
+ "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
+ {"InvariantAdd"});
+ *graph.add_node() = CreateNode("Const2", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode(
+ "InvariantAdd2", "Add", {"InvariantEnter2", "Const2"});
+ *graph.add_node() = CreateNode(
+ "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
+ *graph.add_node() = CreateNode(
+ "VariantEnter2", "Enter", "while/while/while_context", false, 1,
+ {"VariantEnter"});
+ *graph.add_node() = CreateNode(
+ "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
+ *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
+ *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
+ *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
+ *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
+ *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
+ *graph.add_node() = CreateNode(
+ "NextIteration2", "NextIteration", {"VariantAdd2"});
+ *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+
+ GrapplerItem item;
+ item.graph = graph;
+
+ LoopOptimizer optimizer;
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(status);
+
+ std::unique_ptr<NodeMap> node_map;
+ std::unordered_map<const NodeDef*, std::vector<int>> frames;
+ int num_frames;
+
+ node_map.reset(new NodeMap(&graph));
+ EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+
+ node_map.reset(new NodeMap(&output));
+ EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+ EXPECT_EQ(num_frames, 2);
+ EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
+ EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 0);
+}
void VerifyGraphsEqual(const GraphDef& original_graph,
const GraphDef& optimized_graph, const string& func) {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1e2a33566b..48d5955ad1 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5137,7 +5137,6 @@ tf_kernel_library(
srcs = [
"dequantize_op.cc",
"meta_support.cc",
- "quantization_utils.cc",
"quantize_down_and_shrink_range.cc",
"quantize_op.cc",
"quantized_activation_ops.cc",
@@ -5169,6 +5168,7 @@ tf_kernel_library(
":image_resizer_state",
":ops_util",
":pooling_ops",
+ ":quantization_utils",
"//tensorflow/core:array_ops_op_lib",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
@@ -5716,6 +5716,16 @@ tf_kernel_library(
)
cc_library(
+ name = "quantization_utils",
+ srcs = ["quantization_utils.cc"],
+ hdrs = ["quantization_utils.h"],
+ deps = [
+ "//tensorflow/core:framework",
+ "@gemmlowp",
+ ],
+)
+
+cc_library(
name = "remote_fused_graph_execute_utils",
srcs = [
"i_remote_fused_graph_ops_definitions.cc",
@@ -6088,7 +6098,6 @@ cc_library(
srcs = [
"cwise_ops_common.cc",
"meta_support.cc",
- "quantization_utils.cc",
],
hdrs = [
"cwise_ops.h",
@@ -6097,10 +6106,10 @@ cc_library(
"cwise_ops_gpu_gradients.cu.h",
"cwise_ops_gradients.h",
"meta_support.h",
- "quantization_utils.h",
],
deps = [
":bounds_check",
+ ":quantization_utils",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//third_party/eigen3",
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 8c54f22f10..e8a58eea80 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
- double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
+ bfloat16, double, int32, int64);
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half,
double, int64);
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 9b2146aca3..9e564b016f 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -1109,19 +1109,12 @@ class MklFusedBatchNormGradOp : public OpKernel {
return;
}
- if (dnn_shape_src.IsMklTensor())
- depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
- else
- ExtractParams(context);
-
- memory::format format_m;
if (dnn_shape_src.IsMklTensor()) {
- if (dnn_shape_src.IsTensorInNCHWFormat())
- format_m = memory::format::nchw;
- else
- format_m = memory::format::nhwc;
+ depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+ } else if (dnn_shape_diff_dst.IsMklTensor()) {
+ depth_ = dnn_shape_diff_dst.DimSize(MklDnnDims::Dim_C);
} else {
- format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+ ExtractParams(context);
}
MklDnnData<T> src(&cpu_engine);
@@ -1145,20 +1138,20 @@ class MklFusedBatchNormGradOp : public OpKernel {
diff_dst_dims =
TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
- // set src and diff_dst primitives
+ // set src and diff_dst primitives according to input layout
memory::desc src_md({}, memory::data_undef, memory::format_undef);
memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
- if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
- if (dnn_shape_src.IsMklTensor()) {
- src_md = dnn_shape_src.GetMklLayout();
- diff_dst_md = src_md;
- } else {
- diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
- src_md = diff_dst_md;
- }
+ if (dnn_shape_src.IsMklTensor()) {
+ src_md = dnn_shape_src.GetMklLayout();
} else {
- src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
- diff_dst_md = src_md;
+ src_md = memory::desc(src_dims, MklDnnType<T>(),
+ TFDataFormatToMklDnnDataFormat(tensor_format_));
+ }
+ if (dnn_shape_diff_dst.IsMklTensor()) {
+ diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+ } else {
+ diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
+ TFDataFormatToMklDnnDataFormat(tensor_format_));
}
src.SetUsrMem(src_md, &src_tensor);
diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
@@ -1210,28 +1203,64 @@ class MklFusedBatchNormGradOp : public OpKernel {
// allocate diff_src tensor
MklDnnShape dnn_shape_diff_src;
TensorShape tf_shape_diff_src;
- if (dnn_shape_src.IsMklTensor()) {
+
+    // MKL-DNN's batch normalization primitive does not provide an API to
+    // fetch its internal format, so pick a common memory descriptor
+    // (common_md) to use as the op memory: src and diff_dst are reordered
+    // to common_md, and diff_src is allocated with common_md.
+ memory::desc common_md({}, memory::data_undef, memory::format_undef);
+ if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+ if (dnn_shape_src.IsMklTensor()) {
+ common_md = dnn_shape_src.GetMklLayout();
+ } else {
+ common_md = dnn_shape_diff_dst.GetMklLayout();
+ }
+ } else {
+ common_md = memory::desc(src_dims, MklDnnType<T>(),
+ TFDataFormatToMklDnnDataFormat(tensor_format_));
+ }
+    // If either src or diff_dst is in MKL layout,
+    // diff_src is set to MKL layout as well.
+ if (dnn_shape_src.IsMklTensor() ||
+ dnn_shape_diff_dst.IsMklTensor()) {
dnn_shape_diff_src.SetMklTensor(true);
- auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+      // Set diff_src's MKL layout to common_md.
+ auto diff_src_pd = memory::primitive_desc(common_md, cpu_engine);
dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
dnn_shape_diff_src.SetElemType(MklDnnType<T>());
- dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
- format_m);
- dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
- tensor_format_);
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_diff_src.SetTfLayout(
+ dnn_shape_src.GetDimension(),
+ src_dims,
+ dnn_shape_src.GetTfDataFormat());
+ dnn_shape_diff_src.SetTfDimOrder(
+ dnn_shape_src.GetDimension(),
+ tensor_format_);
+ } else {
+ dnn_shape_diff_src.SetTfLayout(
+ dnn_shape_diff_dst.GetDimension(),
+ src_dims,
+ dnn_shape_diff_dst.GetTfDataFormat());
+ dnn_shape_diff_src.SetTfDimOrder(
+ dnn_shape_diff_dst.GetDimension(),
+ tensor_format_);
+ }
tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
} else {
dnn_shape_diff_src.SetMklTensor(false);
+      // Both src and diff_dst are in TensorFlow layout, so the
+      // TensorFlow shape can be used directly.
tf_shape_diff_src = src_tensor.shape();
}
AllocateOutputSetMklShape(context, kDiffSrcIndex, &diff_src_tensor,
tf_shape_diff_src, dnn_shape_diff_src);
- diff_src.SetUsrMem(src_md, diff_src_tensor);
+    // Set diff_src's user memory to the common layout.
+ diff_src.SetUsrMem(common_md, diff_src_tensor);
prop_kind pk = prop_kind::backward;
auto bnrm_bwd_desc = batch_normalization_backward::desc(
- pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+ pk, common_md, common_md, epsilon_,
/* for inference, specify use_global_stats
1. on fwd prop, use mean and variance
provided as inputs
@@ -1244,11 +1273,16 @@ class MklFusedBatchNormGradOp : public OpKernel {
auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
+ std::vector<primitive> net;
+ src.CheckReorderToOpMem(memory::primitive_desc(common_md,
+ cpu_engine), &net);
+ diff_dst.CheckReorderToOpMem(memory::primitive_desc(common_md,
+ cpu_engine), &net);
+
auto bnrm_bwd_op = batch_normalization_backward(
bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
- std::vector<primitive> net;
net.push_back(bnrm_bwd_op);
stream(stream::kind::eager).submit(net).wait();
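A hedged sketch (hypothetical types, not the MKL-DNN API) of the layout decision made above: the backward primitive needs src and diff_dst in one format, so the code prefers whichever input already carries an MKL layout and otherwise falls back to the TensorFlow data format:

    enum class Layout { kFromSrc, kFromDiffDst, kFromTensorFlowFormat };

    Layout PickCommonLayout(bool src_is_mkl, bool diff_dst_is_mkl) {
      if (src_is_mkl) return Layout::kFromSrc;           // reuse src's layout
      if (diff_dst_is_mkl) return Layout::kFromDiffDst;  // else diff_dst's
      return Layout::kFromTensorFlowFormat;              // else NCHW/NHWC
    }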
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index f006954c03..267f4f8d12 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -367,8 +367,11 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
mkl_context.MklCleanup();
}
+
+
#else // INTEL_MKL_ML
+
template <typename Device, typename T, algorithm alg_kind>
class MklReluOpBase : public OpKernel {
public:
@@ -578,17 +581,26 @@ class MklReluGradOpBase : public OpKernel {
// allocate diff_src tensor
MklDnnShape dnn_shape_diff_src;
TensorShape tf_shape_diff_src;
- if (dnn_shape_src.IsMklTensor()) {
+ if (dnn_shape_src.IsMklTensor() ||
+ dnn_shape_diff_dst.IsMklTensor()) {
dnn_shape_diff_src.SetMklTensor(true);
auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
dnn_shape_diff_src.SetElemType(MklDnnType<T>());
- dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
- dnn_shape_src.GetSizesAsMklDnnDims(),
- dnn_shape_src.GetTfDataFormat());
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+ dnn_shape_src.GetSizesAsMklDnnDims(),
+ dnn_shape_src.GetTfDataFormat());
+ } else {
+ dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(),
+ dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
+ dnn_shape_diff_dst.GetTfDataFormat());
+ }
tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
} else {
dnn_shape_diff_src.SetMklTensor(false);
+      // Both src and diff_dst are in TensorFlow layout, so the
+      // TensorFlow shape can be used directly.
tf_shape_diff_src = src_tensor.shape();
}
AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 8b86596721..33c63e7050 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -43,7 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
.TypeConstraint<int64>("Tshape"), \
ReshapeOp);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
TF_CALL_bool(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index bbf8696531..2fc73a3309 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -20,10 +20,10 @@ limitations under the License.
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
-#include <vector>
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
+#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@@ -356,158 +356,180 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
#undef REGISTER_GPU_SORTED_KERNELS_ALL
#endif // GOOGLE_CUDA
+// ____________________________________________________________________________
+// Unsorted segment reduction ops.
+
namespace functor {
-// UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
-// UnsortedSegmentMaxFunctor.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const CPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+// The CPU implementation of the unsorted segment reduction functor.
+template <typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
- output.setZero();
+ typename TTypes<T, 2>::Tensor output) {
+ output.setConstant(InitialValueF()());
if (data_size == 0) {
return;
}
const int64 N = segment_ids.dimension(0);
+ ReductionF reduction;
auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
for (int64 i = 0; i < N; ++i) {
Index j = internal::SubtleMustCopy(segment_ids(i));
if (j < 0) {
continue;
}
- OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
+ OP_REQUIRES(ctx, FastBoundsCheck(j, num_segments),
errors::InvalidArgument(
"segment_ids", SliceDebugString(segment_ids_shape, i),
- " = ", j, " is out of range [0, ", output_rows, ")"));
- output.template chip<0>(j) += data_flat.template chip<0>(i);
+ " = ", j, " is out of range [0, ", num_segments, ")"));
+ reduction(data_flat.template chip<0>(i), output.template chip<0>(j));
}
}
};
-// UnsortedSegmentMaxFunctor implementation for CPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentMaxFunctor<CPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const CPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
- output.setConstant(std::numeric_limits<T>::lowest());
- if (data_size == 0) {
- return;
- }
- const int64 N = segment_ids.dimension(0);
- auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
- for (int64 i = 0; i < N; ++i) {
- Index j = internal::SubtleMustCopy(segment_ids(i));
- OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
- errors::InvalidArgument(
- "segment_ids", SliceDebugString(segment_ids_shape, i),
- " = ", j, " is out of range [0, ", output_rows, ")"));
- output.template chip<0>(j) =
- data_flat.template chip<0>(i).cwiseMax(output.template chip<0>(j));
- }
+
+template <typename T>
+using MatrixChip = Eigen::TensorChippingOp<0l, typename TTypes<T, 2>::Matrix>;
+
+template <typename T>
+using constMatrixChip =
+ Eigen::TensorChippingOp<0l, const typename TTypes<T, 2>::ConstMatrix>;
+
+// Reduction functors, applied one segment row at a time.
+template <typename T>
+struct SumOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output += data;
+ }
+};
+
+template <typename T>
+struct MaxOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output = data.cwiseMax(output);
+ }
+};
+
+template <typename T>
+struct MinOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output = data.cwiseMin(output);
+ }
+};
+
+template <typename T>
+struct ProdOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output *= data;
}
};
} // namespace functor
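A scalar-level sketch (plain floats instead of Eigen chips; names are illustrative) of how the initial-value and reduction functors compose inside UnsortedSegmentFunctor:

    #include <cstddef>
    #include <vector>

    struct ZeroF { float operator()() const { return 0.0f; } };
    struct SumF  { void operator()(float d, float& out) const { out += d; } };

    template <typename InitialValueF, typename ReductionF>
    void UnsortedSegmentReduceSketch(const std::vector<float>& data,
                                     const std::vector<int>& segment_ids,
                                     int num_segments,
                                     std::vector<float>* output) {
      output->assign(num_segments, InitialValueF()());  // seed every segment
      ReductionF reduce;
      for (std::size_t i = 0; i < data.size(); ++i) {
        const int j = segment_ids[i];
        if (j < 0) continue;              // negative ids are skipped, as above
        if (j >= num_segments) continue;  // the kernel raises InvalidArgument
        reduce(data[i], (*output)[j]);
      }
    }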
-// Base class for SegmentReductionOps that can handle unsorted segment
-// definitions
-// and specifying the size of the output in addition to a reduction function
-template <typename Device, class T, class Index>
-class UnsortedSegmentBaseOp : public OpKernel {
+// Static check routines, kept outside the templated class to reduce code
+// size.
+static void UnsortedSegmentReductionValidation(OpKernel* op_kernel,
+ OpKernelContext* context,
+ const Tensor& data,
+ const Tensor& segment_ids,
+ const Tensor& num_segments) {
+ OP_REQUIRES(
+ context, op_kernel->IsLegacyScalar(num_segments.shape()),
+ errors::InvalidArgument("num_segments should be a scalar, not shape ",
+ num_segments.shape().DebugString()));
+ OP_REQUIRES(
+ context, TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+ errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
+ " does not start with segment_ids.shape = ",
+ segment_ids.shape().DebugString()));
+}
+
+static bool UnsortedSegmentReductionDoValidation(OpKernel* op_kernel,
+ OpKernelContext* context,
+ const Tensor& data,
+ const Tensor& segment_ids,
+ const Tensor& num_segments) {
+ UnsortedSegmentReductionValidation(op_kernel, context, data, segment_ids,
+ num_segments);
+ return context->status().ok();
+}
+
+// The UnsortedSegmentReduction OpKernel. DeviceReductionFunctor is the
+// device-specific implementation of the reduction; these implementations
+// are themselves templated on the corresponding initial-value and
+// reduction functors.
+template <typename T, typename Index, typename DeviceReductionFunctor>
+class UnsortedSegmentReductionOp : public OpKernel {
public:
- explicit UnsortedSegmentBaseOp(
- OpKernelConstruction* context,
- functor::UnsortedSegmentBaseFunctor<Device, T, Index>& functor)
- : OpKernel(context), reduction_functor_(functor) {}
+ explicit UnsortedSegmentReductionOp(OpKernelConstruction* context)
+ : OpKernel(context), reduction_functor_(DeviceReductionFunctor()) {}
void Compute(OpKernelContext* context) override {
const Tensor& data = context->input(0);
const Tensor& segment_ids = context->input(1);
const Tensor& num_segments = context->input(2);
-
- OP_REQUIRES(
- context, IsLegacyScalar(num_segments.shape()),
- errors::InvalidArgument("num_segments should be a scalar, not shape ",
- num_segments.shape().DebugString()));
- OP_REQUIRES(
- context,
- TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
- errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
- " does not start with segment_ids.shape = ",
- segment_ids.shape().DebugString()));
-
+ if (!UnsortedSegmentReductionDoValidation(this, context, data, segment_ids,
+ num_segments)) {
+ return;
+ }
const auto segment_flat = segment_ids.flat<Index>();
const Index output_rows =
internal::SubtleMustCopy(num_segments.scalar<int32>()());
OP_REQUIRES(context, output_rows >= 0,
errors::InvalidArgument("Input num_segments == ", output_rows,
" must not be negative."));
-
TensorShape output_shape;
output_shape.AddDim(output_rows);
for (int i = segment_ids.dims(); i < data.dims(); i++) {
output_shape.AddDim(data.dim_size(i));
}
-
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
auto output_flat = output->flat_outer_dims<T>();
-
auto data_ptr = data.template flat<T>().data();
- reduction_functor_(context, context->template eigen_device<Device>(),
- output_rows, segment_ids.shape(), segment_flat,
+ reduction_functor_(context, output_rows, segment_ids.shape(), segment_flat,
data.NumElements(), data_ptr, output_flat);
}
- private:
- functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
-};
-
-template <typename Device, class T, class Index>
-class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
- explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
- : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
-
- private:
- functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+ protected:
+ DeviceReductionFunctor reduction_functor_;
};
-template <typename Device, class T, class Index>
-class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
- explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
- : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
-
- private:
- functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
-};
-
-#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<CPUDevice, type, index_type>); \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentMax") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentMaxOp<CPUDevice, type, index_type>);
-
-#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+#define REGISTER_CPU_KERNEL_UNSORTEDSEGMENT( \
+ name, type, index_type, initial_value_functor, reduction_functor) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(name) \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentReductionOp< \
+ type, index_type, \
+ functor::UnsortedSegmentFunctor<CPUDevice, type, index_type, \
+ initial_value_functor, \
+ reduction_functor> >)
+
+#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
+ functor::Lowest<type>, \
+ functor::MaxOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
+ functor::Highest<type>, \
+ functor::MinOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOp<type>);
+
+#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOp<type>)
#define REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL(type) \
REGISTER_REAL_CPU_UNSORTED_KERNELS(type, int32); \
@@ -520,31 +542,72 @@ class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
TF_CALL_REAL_NUMBER_TYPES(REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL);
REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex64);
REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
+
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_KERNEL_UNSORTEDSEGMENT
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL
#if GOOGLE_CUDA
-#define REGISTER_GPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_GPU) \
- .HostMemory("num_segments") \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<GPUDevice, type, index_type>);
-
-#define REGISTER_GPU_UNSORTED_KERNELS_ALL(type) \
- REGISTER_GPU_UNSORTED_KERNELS(type, int32); \
- REGISTER_GPU_UNSORTED_KERNELS(type, int64);
+#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT( \
+ name, type, index_type, initial_value_functor, reduction_kernel_functor) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(name) \
+ .Device(DEVICE_GPU) \
+ .HostMemory("num_segments") \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentReductionOp< \
+ type, index_type, \
+ functor::UnsortedSegmentFunctor<GPUDevice, type, index_type, \
+ initial_value_functor, \
+ reduction_kernel_functor> >)
+
+// Sum is currently the only op that supports all input types; it is registered separately below.
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
+ functor::Lowest<type>, \
+ functor::MaxOpGpu<type>); \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
+ functor::Highest<type>, \
+ functor::MinOpGpu<type>); \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOpGpu<type>);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOpGpu<type>);
+
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int64);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+
+#undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex64(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex128(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-#undef REGISTER_GPU_UNSORTED_KERNELS
-#undef REGISTER_GPU_UNSORTED_KERNELS_ALL
#endif // GOOGLE_CUDA
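The refactor above collapses the per-op kernel classes into a single
UnsortedSegmentReductionOp parameterized by an initial-value functor and a
reduction functor. A minimal standalone sketch of that composition (hedged:
hypothetical names, not the TensorFlow implementation itself):

    #include <cstdio>
    #include <vector>

    template <typename T> struct ZeroF {
      T operator()() const { return T(0); }
    };
    template <typename T> struct SumF {
      void operator()(T* out, const T& v) const { *out += v; }
    };

    // Generic driver: initialize every segment with InitF, then fold each
    // element into its segment with ReduceF, dropping out-of-range ids.
    template <typename T, typename InitF, typename ReduceF>
    std::vector<T> SegmentReduce(const std::vector<T>& data,
                                 const std::vector<int>& segment_ids,
                                 int num_segments) {
      std::vector<T> out(num_segments, InitF()());
      for (size_t i = 0; i < data.size(); ++i) {
        const int s = segment_ids[i];
        if (s >= 0 && s < num_segments) ReduceF()(&out[s], data[i]);
      }
      return out;
    }

    int main() {
      auto sums = SegmentReduce<float, ZeroF<float>, SumF<float>>(
          {1, 2, 3, 4}, {0, 1, 0, 1}, 2);
      std::printf("%g %g\n", sums[0], sums[1]);  // prints: 4 6
    }

Swapping in a One/Prod or Lowest/Max functor pair yields the other reductions
without touching the driver, which is what the registration macros above do
per op name.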
+// ____________________________________________________________________________
+// Sparse segment reduction ops.
+
// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
// by two dense tensors, one containing the data, and the other containing
// indices into the data.
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 5c9cfe0906..4abfbfb1a6 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks with clang, because clang is stricter
+// about template code, and CudaAtomicMax is used in a template context.
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@@ -46,58 +53,80 @@ struct SegmentSumFunctor {
const Index data_size, const T* data,
typename TTypes<T, 2>::Tensor output);
};
-#endif
-// BaseFunctor for definition of UnsorteSegmentReductionOp
-// for usage without templates.
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor {
- virtual ~UnsortedSegmentBaseFunctor() {}
- virtual void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows,
- const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output){};
-};
+#endif
-// Functor for UnsortedSegmentSumOp.
-// output_rows: the number of output segments (unique segment ids in
-// 'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-// perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor
- : public UnsortedSegmentBaseFunctor<Device, T, Index> {
- void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename Device, typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
typename TTypes<T, 2>::Tensor output);
};
-// Functor for UnsortedSegmentMaxOp.
-// output_rows: the number of output segments (unique segment ids in
-// 'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-// perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor
- : public UnsortedSegmentBaseFunctor<Device, T, Index> {
- void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output);
+#ifdef GOOGLE_CUDA
+// Reduction functors for the GPU.
+template <typename T>
+struct SumOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicAdd(dest, value);
+ }
+};
+
+template <typename T>
+struct ProdOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMul(dest, value);
+ }
+};
+
+template <typename T>
+struct MaxOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMax(dest, value);
+ }
+};
+
+template <typename T>
+struct MinOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMin(dest, value);
+ }
};
+
+#endif // GOOGLE_CUDA
+
+// Initial value functors.
+template <typename T>
+struct Zero {
+ EIGEN_STRONG_INLINE T operator()() const { return T(0); }
+};
+
+template <typename T>
+struct One {
+ EIGEN_STRONG_INLINE T operator()() const { return T(1); }
+};
+
+template <typename T>
+struct Lowest {
+ EIGEN_STRONG_INLINE T operator()() const {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+
+template <typename T>
+struct Highest {
+ EIGEN_STRONG_INLINE T operator()() const {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+
} // namespace functor
} // namespace tensorflow
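The initial-value functors above are the identity elements of their
reductions: Zero for sum, One for prod, Lowest for max, Highest for min.
Starting from the identity means the first real element of a segment always
wins, and empty segments report the identity. A small illustration
(assumption: standalone example, not TensorFlow code) of why max must start
from lowest() rather than 0:

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>
    #include <limits>

    int main() {
      // Equivalent of Lowest<float>()(); a 0 initializer would wrongly
      // report 0 as the max of an all-negative segment.
      float out = std::numeric_limits<float>::lowest();
      for (float v : {-3.0f, -7.0f}) out = std::max(out, v);
      std::printf("%g\n", out);  // prints: -3
    }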
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 39d520698e..3511c85f71 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -17,43 +17,19 @@ limitations under the License.
#define EIGEN_USE_GPU
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
+// We need to include cuda_kernel_helper.h before segment_reduction_ops.h
+// See comment in segment_reduction_ops.h for more details.
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/cuda_device_functions.h"
+
namespace tensorflow {
using GPUDevice = Eigen::GpuDevice;
-// Helper for UnusortedSegmentSumCustomKernel that adds value into dest
-// atomically.
-template <typename T>
-static __device__ __forceinline__ void AccumulateInto(T* dest, const T& value) {
- CudaAtomicAdd(dest, value);
-}
-
-// Specializations of AccumulateInto for complex types, which CudaAtomicAdd does
-// not support. We treat a std::complex<T>* as a T* (the C++ standard section
-// 26.4.4 allows this explicitly) and atomic add the real and imaginary
-// components individually. The operation as a whole is not atomic, but we can
-// safely treat the components independently for the purpose of accumulating.
-template <>
-__device__ __forceinline__ void AccumulateInto(
- std::complex<float>* dest, const std::complex<float>& value) {
- auto dest_scalar = reinterpret_cast<float*>(dest);
- CudaAtomicAdd(dest_scalar, value.real());
- CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
-template <>
-__device__ __forceinline__ void AccumulateInto(
- std::complex<double>* dest, const std::complex<double>& value) {
- auto dest_scalar = reinterpret_cast<double*>(dest);
- CudaAtomicAdd(dest_scalar, value.real());
- CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
// SortedSegmentSumFunctor kernel reduces input data just as
// UnsortedSegmentSumCustomKernel does except that input data
// is partitioned along the outer reduction dimension. This is
@@ -81,7 +57,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
const Index* segment_ids,
const T* input, T* output,
const Index total_stripe_count) {
- CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) {
+ for (int stripe_index : CudaGridRangeX(total_stripe_count)) {
const Index segment_offset = stripe_index % inner_dim_size;
const Index input_outer_dim_index_base =
stripe_index / inner_dim_size * Index(OuterDimTileSize);
@@ -106,7 +82,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
// decide whether to write result to global memory using atomic
// operations
if (last_output_segment_id == first_segment_id) {
- AccumulateInto<T>(output + output_index, sum);
+ CudaAtomicAdd(output + output_index, sum);
} else {
*(output + output_index) = sum;
}
@@ -121,31 +97,31 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
// the following strip.
const Index output_index =
last_output_segment_id * inner_dim_size + segment_offset;
- AccumulateInto<T>(output + output_index, sum);
+ CudaAtomicAdd(output + output_index, sum);
}
}
-// UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements.
+// UnsortedSegmentSumKernel processes 'input_total_size' elements.
// Each element is mapped from input to output by a combination of its
// 'segment_ids' mapping and 'inner_dim_size'.
-template <typename T, typename Index>
-__global__ void UnsortedSegmentSumCustomKernel(
- const Index input_outer_dim_size, const Index inner_dim_size,
- const Index output_outer_dim_size, const Index* segment_ids, const T* input,
- T* output) {
+template <typename T, typename Index, typename KernelReductionFunctor>
+__global__ void UnsortedSegmentCustomKernel(const Index input_outer_dim_size,
+ const Index inner_dim_size,
+ const Index output_outer_dim_size,
+ const Index* segment_ids,
+ const T* input, T* output) {
const Index input_total_size = input_outer_dim_size * inner_dim_size;
const Index output_total_size = output_outer_dim_size * inner_dim_size;
- CUDA_1D_KERNEL_LOOP(input_index, input_total_size) {
+ for (int input_index : CudaGridRangeX(input_total_size)) {
const Index input_segment_index = input_index / inner_dim_size;
const Index segment_offset = input_index % inner_dim_size;
const Index output_segment_index = segment_ids[input_segment_index];
-
if (output_segment_index < 0 || output_segment_index >= output_total_size) {
continue;
}
const Index output_index =
output_segment_index * inner_dim_size + segment_offset;
- AccumulateInto<T>(output + output_index, ldg(input + input_index));
+ KernelReductionFunctor()(output + output_index, ldg(input + input_index));
}
}
@@ -190,41 +166,39 @@ void SegmentSumFunctor<T, Index>::operator()(
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
input_outer_dim_size, input_inner_dim_size, output_rows,
segment_ids.data(), data, output.data(), total_stripe_count);
-};
+}
-// UnsortedSegmentSumFunctor implementation for GPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const GPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
+ typename TTypes<T, 2>::Tensor output) {
if (output.size() == 0) {
return;
}
- // Set 'output' to zeros.
+ // Set 'output' to initial value.
+ GPUDevice d = ctx->template eigen_device<GPUDevice>();
CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
- SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- output.size(), output.data());
+ SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ output.size(), output.data(), InitialValueF()());
if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
return;
}
-
- // Launch kernel to compute unsorted segment sum.
+ // Launch kernel to compute unsorted segment reduction.
// Notes:
- // *) 'input_total_size' is the total number of elements to process.
+ // *) 'data_size' is the total number of elements to process.
// *) 'segment_ids.shape' is a prefix of data's shape.
// *) 'input_outer_dim_size' is the total number of segments to process.
- const Index input_total_size = data_size;
const Index input_outer_dim_size = segment_ids.dimension(0);
- const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
+ const Index input_inner_dim_size = data_size / input_outer_dim_size;
+ config = GetCudaLaunchConfig(data_size, d);
- config = GetCudaLaunchConfig(input_total_size, d);
- UnsortedSegmentSumCustomKernel<T, Index>
+ UnsortedSegmentCustomKernel<T, Index, ReductionF>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- input_outer_dim_size, input_inner_dim_size, output_rows,
+ input_outer_dim_size, input_inner_dim_size, num_segments,
segment_ids.data(), data, output.data());
}
};
@@ -238,19 +212,40 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);
-#define DEFINE_GPU_SPECS_INDEX(T, Index) \
- template struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
-
-#define DEFINE_GPU_SPECS(T) \
- DEFINE_GPU_SPECS_INDEX(T, int32); \
- DEFINE_GPU_SPECS_INDEX(T, int64);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
-TF_CALL_complex64(DEFINE_GPU_SPECS);
-TF_CALL_complex128(DEFINE_GPU_SPECS);
-
-#undef DEFINE_GPU_SPECS
-#undef DEFINE_GPU_SPECS_INDEX
+#define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Lowest<T>, functor::MaxOpGpu<T>>; \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Highest<T>, functor::MinOpGpu<T>>; \
+ template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
+ functor::ProdOpGpu<T>>;
+
+// Sum is currently the only op that supports all input types.
+#define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Zero<T>, functor::SumOpGpu<T>>;
+
+#define DEFINE_REAL_GPU_SPECS(T) \
+ DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+ DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+#define DEFINE_SUM_GPU_SPECS(T) \
+ DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+ DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_REAL_GPU_SPECS);
+TF_CALL_int32(DEFINE_REAL_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_SUM_GPU_SPECS);
+TF_CALL_int32(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex64(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex128(DEFINE_SUM_GPU_SPECS);
+
+#undef DEFINE_SORTED_GPU_SPECS_INDEX
+#undef DEFINE_SORTED_GPU_SPECS
+#undef DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_REAL_GPU_SPECS
+#undef DEFINE_SUM_GPU_SPECS
} // namespace functor
} // namespace tensorflow
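The GPU path assigns one thread per input element: the element at flat index
i belongs to outer row i / inner_dim_size, looks up that row's segment id,
and reduces into output[segment * inner_dim_size + i % inner_dim_size] with a
device atomic. A sequential C++ stand-in for that mapping (a simplified
sketch with a sum reduction, not the device code):

    #include <cstdio>

    int main() {
      const int inner = 2, num_segments = 2;
      const int segment_ids[3] = {1, 0, 1};       // one id per outer row
      const float input[6] = {1, 2, 3, 4, 5, 6};  // 3 rows x 2 columns
      float output[4] = {};  // the kernel uses SetToValue for this step
      for (int i = 0; i < 6; ++i) {               // one GPU thread per i
        const int seg = segment_ids[i / inner];
        if (seg < 0 || seg >= num_segments) continue;  // drop negative ids
        output[seg * inner + i % inner] += input[i];   // CudaAtomicAdd on GPU
      }
      std::printf("%g %g %g %g\n",
                  output[0], output[1], output[2], output[3]);  // 3 4 6 8
    }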
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 0ef8724b10..31388e4290 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -223,6 +223,16 @@ class UniqueOp : public OpKernel {
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
+ UniqueOp<type, int64>); \
+ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("out_idx"), \
+ UniqueOp<type, int32>); \
+ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
REGISTER_UNIQUE(string)
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
index a61272675b..62e814ff77 100644
--- a/tensorflow/core/kernels/unravel_index_op.cc
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -15,11 +15,11 @@ limitations under the License.
#define EIGEN_USE_THREADS
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index a1027f1422..39b92464cb 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -1203,6 +1203,23 @@ REGISTER_OP("UniqueWithCounts")
return Status::OK();
});
+REGISTER_OP("UniqueWithCountsV2")
+ .Input("x: T")
+ .Input("axis: Taxis")
+ .Output("y: T")
+ .Output("idx: out_idx")
+ .Output("count: out_idx")
+ .Attr("T: type")
+ .Attr("Taxis: {int32,int64} = DT_INT64")
+ .Attr("out_idx: {int32, int64} = DT_INT32")
+ .SetShapeFn([](InferenceContext* c) {
+ auto uniq = c->Vector(InferenceContext::kUnknownDim);
+ c->set_output(0, uniq);
+ c->set_output(1, c->input(0));
+ c->set_output(2, uniq);
+ return Status::OK();
+ });
+
namespace {
Status ShapeShapeFn(InferenceContext* c) {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 872ebe98c1..8f33d51d5a 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1065,6 +1065,26 @@ REGISTER_OP("UnsortedSegmentMax")
.Attr("Tnumsegments: {int32,int64} = DT_INT32")
.SetShapeFn(UnsortedSegmentReductionShapeFn);
+REGISTER_OP("UnsortedSegmentMin")
+ .Input("data: T")
+ .Input("segment_ids: Tindices")
+ .Input("num_segments: Tnumsegments")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .Attr("Tindices: {int32,int64}")
+ .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+ .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
+REGISTER_OP("UnsortedSegmentProd")
+ .Input("data: T")
+ .Input("segment_ids: Tindices")
+ .Input("num_segments: Tnumsegments")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .Attr("Tindices: {int32,int64}")
+ .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+ .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
REGISTER_OP("SparseSegmentSum")
.Input("data: T")
.Input("indices: Tidx")
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 52bf0d4694..301fcb9dbf 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include <aws/core/utils/StringUtils.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
+#include <aws/core/utils/StringUtils.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <aws/s3/model/CopyObjectRequest.h>
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 582b232054..f3b27ea394 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -25,6 +25,7 @@ limitations under the License.
#endif
#include <Windows.h>
+#include <shlwapi.h>
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/demangle.h"
@@ -149,11 +150,16 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
string Demangle(const char* mangled) { return mangled; }
double NominalCPUFrequency() {
-#ifdef TENSORFLOW_USE_ABSL
- return absl::base_internal::NominalCPUFrequency();
-#else
+ DWORD data;
+ DWORD data_size = sizeof(data);
+ #pragma comment(lib, "shlwapi.lib") // For SHGetValue().
+ if (SUCCEEDED(
+ SHGetValueA(HKEY_LOCAL_MACHINE,
+ "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+ "~MHz", nullptr, &data, &data_size))) {
+ return data * 1e6; // Value is MHz.
+ }
return 1.0;
-#endif
}
int64 AvailableRam() {
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index f787687f66..f2d4e470c8 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -28,14 +28,10 @@ limitations under the License.
#include <algorithm>
#include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "cuda/include/cuda.h"
-#include "cuda/include/device_functions.h"
#include "tensorflow/core/platform/types.h"
-#if CUDA_VERSION >= 7050
-#include "cuda/include/cuda_fp16.h"
-#endif // CUDA_VERSION >= 7050
-
namespace tensorflow {
namespace detail {
@@ -394,6 +390,17 @@ __global__ void SetZero(const int count, T* ptr) {
}
}
+// Helper to set all tensor entries to a specific value.
+template <typename T>
+__global__ void SetToValue(const int count, T* ptr, T value) {
+ // Check that the grid is one-dimensional and that the index doesn't overflow.
+ assert(blockDim.y == 1 && blockDim.z == 1);
+ assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+ for (int i : CudaGridRangeX(count)) {
+ ptr[i] = value;
+ }
+}
+
namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
@@ -425,6 +432,47 @@ __device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
}));
}
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+ static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+ namespace half_impl = Eigen::half_impl;
+ intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+ assert(!(intptr & 0x1)); // should be 2-aligned.
+ if (intptr & 0x2) {
+ // The half is in the second part of the uint32 (upper 16 bits).
+ uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+ uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+ unsigned short high = static_cast<unsigned short>(arg >> 16);
+ Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+ return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+ });
+ return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+ } else {
+ // The half is in the first part of the uint32 (lower 16 bits).
+ uint32* address = reinterpret_cast<uint32*>(intptr);
+ uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+ unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+ Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+ return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+ });
+ return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
+ }
+}
+
template <typename From, typename To>
using ToTypeIfConvertible =
typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
@@ -438,6 +486,14 @@ template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
return atomicAdd(ptr, value);
}
+
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return a + value; });
+}
+
#if __CUDA_ARCH__ < 600
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
return detail::CudaAtomicCasHelper(ptr,
@@ -455,27 +511,74 @@ __device__ inline double CudaAtomicAdd(double* ptr, double value) {
return result;
}
#endif
-
+// CudaAtomicAdd
+// Overloads of CudaAtomicAdd for complex types, which the generic
+// CudaAtomicAdd does not support. We treat a std::complex<T>* as a T* (the
+// C++ standard section 26.4.4 allows this explicitly) and atomically add the
+// real and imaginary components individually. The operation as a whole is
+// not atomic, but we can safely treat the components independently for the
+// purpose of accumulating.
+__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
+ std::complex<float> value) {
+ auto ptr_scalar = reinterpret_cast<float*>(ptr);
+ return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
+ CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+__device__ inline std::complex<double> CudaAtomicAdd(
+ std::complex<double>* ptr, std::complex<double> value) {
+ auto ptr_scalar = reinterpret_cast<double*>(ptr);
+ return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
+ CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+// CudaAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
return atomicSub(ptr, value);
}
+
// Specializations of subtraction which add the negative value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
return CudaAtomicAdd(ptr, -value);
}
+
__device__ inline double CudaAtomicSub(double* ptr, double value) {
return CudaAtomicAdd(ptr, -value);
}
+
__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
tensorflow::uint64 value) {
return CudaAtomicAdd(ptr, -value);
}
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return a - value; });
+}
+
+// CudaAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
return atomicMax(ptr, value);
}
+
+__device__ inline float CudaAtomicMax(float* ptr, float value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](float a) { return max(a, value); });
+}
+
+__device__ inline double CudaAtomicMax(double* ptr, double value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](double a) { return max(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return max(a, value); });
+}
+
#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
tensorflow::uint64 value) {
@@ -484,10 +587,43 @@ __device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
}
#endif
+// CudaAtomicMin
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
+ return atomicMin(ptr, value);
+}
+
+__device__ inline float CudaAtomicMin(float* ptr, float value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](float a) { return min(a, value); });
+}
+
+__device__ inline double CudaAtomicMin(double* ptr, double value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](double a) { return min(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return min(a, value); });
+}
+
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
+ tensorflow::uint64 value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+}
+#endif
+
+// CudaAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
}
+
+// CudaAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
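Every non-hardware-native overload above (float/double/half max and min, mul,
div) reduces to CudaAtomicCasHelper: an arbitrary read-modify-write built
from a compare-and-swap retry loop. A host-side std::atomic analogue of the
same pattern (an illustration under that assumption, not the device code):

    #include <atomic>
    #include <cstdio>

    template <typename T, typename F>
    T AtomicApply(std::atomic<T>* ptr, F accumulate) {
      T old_val = ptr->load();
      // compare_exchange_weak reloads old_val on failure, so the loop
      // retries until no other thread raced between our read and our write.
      while (!ptr->compare_exchange_weak(old_val, accumulate(old_val))) {
      }
      return old_val;  // atomics conventionally return the previous value
    }

    int main() {
      std::atomic<float> acc{2.0f};
      AtomicApply(&acc, [](float a) { return a * 3.0f; });
      std::printf("%g\n", acc.load());  // prints: 6
    }

On the device the same loop runs per update, which is why the fp16 comment
earlier in this file warns that half accumulation is slow under high
concurrency. With the new Eigen::half overload, a kernel can still accumulate
fp16 directly (sketch, assuming cuda_kernel_helper.h is included):

    __global__ void HalfSum(Eigen::half* acc, const Eigen::half* vals, int n) {
      for (int i : CudaGridRangeX(n)) CudaAtomicAdd(acc, vals[i]);
    }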
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 01a5b6828a..0ab875625f 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -95,60 +95,6 @@ __device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
}
-namespace detail {
-// Overload of above function for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// This version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-template <typename F>
-__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
-#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
- static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
-#endif
- namespace half_impl = Eigen::half_impl;
- intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
- assert(!(intptr & 0x1)); // should be 2-aligned.
- if (intptr & 0x2) {
- // The half is in the second part of the uint32 (upper 16 bits).
- uint32* address = reinterpret_cast<uint32*>(intptr - 2);
- uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
- unsigned short high = static_cast<unsigned short>(arg >> 16);
- Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
- return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
- });
- return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
- } else {
- // The half is in the first part of the uint32 (lower 16 bits).
- uint32* address = reinterpret_cast<uint32*>(intptr);
- uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
- unsigned short low = static_cast<unsigned short>(arg & 0xffff);
- Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
- return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
- });
- return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
- }
-}
-} // namespace detail
-
-__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
- Eigen::half value) {
- return detail::CudaAtomicCasHelper(
- ptr, [value](Eigen::half a) { return a + value; });
-}
-__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
- Eigen::half value) {
- return detail::CudaAtomicCasHelper(
- ptr, [value](Eigen::half a) { return a - value; });
-}
-
namespace cuda_helper {
template <typename IntType>
__device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md
index dfa2110e69..4aa07c7f2a 100644
--- a/tensorflow/docs_src/get_started/checkpoints.md
+++ b/tensorflow/docs_src/get_started/checkpoints.md
@@ -154,7 +154,7 @@ classifier = tf.estimator.DNNClassifier(
The first time you call an Estimator's `train` method, TensorFlow saves a
checkpoint to the `model_dir`. Each subsequent call to the Estimator's
-`train`, `eval`, or `predict` method causes the following:
+`train`, `evaluate`, or `predict` method causes the following:
1. The Estimator builds the model's
[graph](https://developers.google.com/machine-learning/glossary/#graph)
@@ -222,7 +222,7 @@ does not match the shape stored in checkpoint: [20]
To run experiments in which you train and compare slightly different
versions of a model, save a copy of the code that created each
-`model-dir`, possibly by creating a separate git branch for each version.
+`model_dir`, possibly by creating a separate git branch for each version.
This separation will keep your checkpoints recoverable.
## Summary
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 185917baae..941c3e1690 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -213,7 +213,7 @@ is connected to every node in the preceding layer. Here's the relevant code:
```
* The `units` parameter defines the number of output neurons in a given layer.
-* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#a) —
+* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#activation_function) —
[Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this
case.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 11a4ef4312..5e39e710a0 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -138,8 +138,8 @@ element in `operand`. The `feature_index` must be a valid index for the feature
dimension in `operand`.
The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
+contains `m` elements with `w` and `h` as the size of spatial dimensions
+(assuming `operand` is a 4-dimensional array):
- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
@@ -173,7 +173,7 @@ Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
operation from a data shape to a target shape. The dimensions must match, and
the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
via bitcast routine. Bitcast is implemented as a low-level cast, so machines
-with different floating point representations will give different results.
+with different floating-point representations will give different results.
<b> `BitcastConvertType(operand, new_element_type)` </b>
@@ -354,7 +354,7 @@ each other) and contains the arguments in the order that they were specified.
: : : concatenated between the `operands`. :
With the exception of `dimension` all dimensions must be the same. This is
-because XLA does not support "ragged" arrays Also note that rank-0 values
+because XLA does not support "ragged" arrays. Also note that rank-0 values
cannot be concatenated (as it's impossible to name the dimension along which the
concatenation occurs).
@@ -473,7 +473,7 @@ filter/kernel/window. The dimensions are, in this order:
window that moves across the base area.
The `window_strides` argument specifies the stride of the convolutional window
-in the spatial dimensions. For example, if the stride in a the first spatial
+in the spatial dimensions. For example, if the stride in the first spatial
dimension is 3, then the window can only be placed at coordinates where the
first spatial index is divisible by 3.
@@ -947,7 +947,7 @@ expand the rank of the lower-rank operand up to the rank of the higher-rank
operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimension to equalize the
+then broadcasts the shapes along these degenerate dimensions to equalize the
shapes of both operands. The semantics are described in detail on the
@{$broadcasting$broadcasting page}.
@@ -1293,7 +1293,7 @@ result2 = while (condition, init = result1) {
```
Nested tuple shapes are not supported. For an empty tuple shape, the Infeed
-operation is effectively a nop and proceeds without reading any data from the
+operation is effectively a no-op and proceeds without reading any data from the
Infeed of the device.
> Note: We plan to allow multiple Infeed operations without a total order, in
@@ -1356,7 +1356,7 @@ dimension.
`PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
-`interior_padding`. `edge_padding_low` and `edge_padding_high` specifies the
+`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
amount of padding added at the low-end (next to index 0) and the high-end (next
to the highest index) of each dimension respectively. The amount of edge padding
can be negative -- the absolute value of negative padding indicates the number
@@ -1365,8 +1365,8 @@ the amount of padding added between any two elements in each dimension. Interior
padding occurs logically before edge padding, so in the case of negative edge
padding elements are removed from the interior-padded operand. This operation is
a no-op if the edge padding pairs are all (0, 0) and the interior padding values
-are all 0. Figure below shows examples of different `edge_padding` and
-`interior_padding` values for a two dimensional array.
+are all 0. The figure below shows examples of different `edge_padding` and
+`interior_padding` values for a two-dimensional array.
<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 1744494f72..d01d187e86 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -736,15 +736,15 @@ executing the computation graph later. For example:
$ saved_model_cli show --dir \
/tmp/saved_model_dir --tag_set serve --signature_def serving_default
The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x:0
+ inputs['x'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x:0
The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y:0
+ outputs['y'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y:0
Method name is: tensorflow/serving/predict
```
@@ -756,32 +756,32 @@ $ saved_model_cli show --dir /tmp/saved_model_dir --all
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['classify_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y3:0
-Method name is: tensorflow/serving/classify
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x2:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['scores'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y3:0
+ Method name is: tensorflow/serving/classify
...
signature_def['serving_default']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y:0
-Method name is: tensorflow/serving/predict
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['x'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['y'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y:0
+ Method name is: tensorflow/serving/predict
```
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 6425073805..e8cf771155 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -62,9 +62,10 @@ them. For this reason TensorFlow provides **collections**, which are named lists
of tensors or other objects, such as `tf.Variable` instances.
By default every `tf.Variable` gets placed in the following two collections:
+
* `tf.GraphKeys.GLOBAL_VARIABLES` --- variables that can be shared across
-multiple devices,
- * `tf.GraphKeys.TRAINABLE_VARIABLES`--- variables for which TensorFlow will
+ multiple devices,
+ * `tf.GraphKeys.TRAINABLE_VARIABLES` --- variables for which TensorFlow will
calculate gradients.
If you don't want a variable to be trainable, add it to the
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index 07c1919347..f084931215 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -357,14 +357,12 @@ if __name__ == '__main__':
'--window_size_ms',
type=float,
default=30.0,
- help='How long each spectrogram timeslice is.',
- )
+ help='How long each spectrogram timeslice is.',)
parser.add_argument(
'--window_stride_ms',
type=float,
default=10.0,
- help='How far to move in time between spectogram timeslices.',
- )
+ help='How far to move in time between spectrogram timeslices.',)
parser.add_argument(
'--dct_coefficient_count',
type=int,
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index c4952cffdd..284e264acd 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1413,7 +1413,8 @@ class TensorFlowTestCase(googletest.TestCase):
"""
device1 = pydev.canonical_name(device1)
device2 = pydev.canonical_name(device2)
- self.assertEqual(device1, device2, "Devices %s and %s are not equal. %s" %
+ self.assertEqual(device1, device2,
+ "Devices %s and %s are not equal. %s" %
(device1, device2, msg))
# Fix Python 3 compatibility issues
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index 74548d05c8..11a5e0aeaa 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -48,8 +48,8 @@ class LSTMLayerTest(test.TestCase):
units = 2
model = keras.models.Sequential()
- inputs = keras.layers.Dense(
- embedding_dim, input_shape=(timesteps, embedding_dim))
+ inputs = keras.layers.Dense(embedding_dim,
+ input_shape=(timesteps, embedding_dim))
model.add(inputs)
layer = keras.layers.LSTM(units, return_sequences=True)
model.add(layer)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 343d158498..8cb9f9e621 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -129,7 +129,7 @@ class LinearOperatorDiagTest(
with self.test_session() as sess:
x = random_ops.random_normal(shape=(2, 2, 3, 4))
- # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
+ # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
# and matmul with 'x' as the argument.
diag = random_ops.random_uniform(shape=(2, 1, 3))
operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 239a48d273..3bca5fadc4 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,7 +46,8 @@ class SegmentReductionHelper(test.TestCase):
return constant_op.constant(
np_values, shape=input_shape, dtype=dtype), np_values
- def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None):
+ def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None,
+ initial_value=0):
if not x.size:
return np.array([])
indices = np.asarray(indices)
@@ -64,13 +65,8 @@ class SegmentReductionHelper(test.TestCase):
else:
output[index] = x_flat[i]
# initialize values that are still uncalculated.
- # output = [o if o is not None else np.zeros(slice_shape) for o in output]
- if not op1 == np.max:
- output = [o if o is not None else np.zeros(slice_shape) for o in output]
- else:
- zeroslice = np.zeros(slice_shape)
- zeroslice.fill(dtype.min)
- output = [o if o is not None else zeroslice for o in output]
+ initial_value_slice = np.ones(slice_shape) * initial_value
+ output = [o if o is not None else initial_value_slice for o in output]
if op2 is not None:
output = [op2(o) for o in output]
output = [o.reshape(slice_shape) for o in output]
@@ -82,6 +78,9 @@ class SegmentReductionHelper(test.TestCase):
def _mean_reduce_op(self, x):
return x[0] / x[1] if isinstance(x, tuple) else x
+ def _sqrt_n_reduce_op(self, x):
+ return x[0] / np.sqrt(x[1]) if isinstance(x, tuple) else x
+
class SegmentReductionOpTest(SegmentReductionHelper):
@@ -244,27 +243,61 @@ class SegmentReductionOpTest(SegmentReductionHelper):
self.assertAllClose(jacob_t, jacob_n)
-class UnsortedSegmentSumTest(SegmentReductionHelper):
+class UnsortedSegmentTest(SegmentReductionHelper):
+
+ def __init__(self, methodName='runTest'):
+ # Each item is np_op1, np_op2, tf_op, initial_value functor
+ self.ops_list = [(np.add, None,
+ math_ops.unsorted_segment_sum, lambda t: 0),
+ (self._mean_cum_op, self._mean_reduce_op,
+ math_ops.unsorted_segment_mean, lambda t: 0),
+ (self._mean_cum_op, self._sqrt_n_reduce_op,
+ math_ops.unsorted_segment_sqrt_n, lambda t: 0),
+ (np.ndarray.__mul__, None,
+ math_ops.unsorted_segment_prod, lambda t: 1),
+ (np.minimum, None,
+ math_ops.unsorted_segment_min, lambda t: t.max),
+ (np.maximum, None,
+ math_ops.unsorted_segment_max, lambda t: t.min)]
+
+ # A subset of ops has been enabled for complex numbers
+ self.complex_ops_list = [(np.add, None,
+ math_ops.unsorted_segment_sum, lambda t: 0)]
+ self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
+ dtypes_lib.float64]
+ self.all_dtypes = (self.differentiable_dtypes +
+ [dtypes_lib.bfloat16,
+ dtypes_lib.int64, dtypes_lib.int32,
+ dtypes_lib.complex64, dtypes_lib.complex128])
+ super(UnsortedSegmentTest, self).__init__(methodName=methodName)
def testValues(self):
- dtypes = [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
- dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
- ]
indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
num_segments = 12
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (2,)
- for dtype in dtypes:
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtype)
- np_ans = self._segmentReduce(
- indices, np_x, np.add, op2=None, num_segments=num_segments)
- s = math_ops.unsorted_segment_sum(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
- tf_ans = s.eval()
- self.assertAllClose(np_ans, tf_ans)
- self.assertShapeEqual(np_ans, s)
+ for dtype in self.all_dtypes:
+ ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
+ tf_x, np_x = self._input(shape, dtype=dtype)
+ for use_gpu in [True, False]:
+ with self.test_session(use_gpu=use_gpu):
+ for np_op1, np_op2, tf_op, init_op in ops_list:
+ # sqrt_n doesn't support integers
+ if (np_op2 == self._sqrt_n_reduce_op and dtype.is_integer):
+ continue
+ # TODO(philjd): enable this test once real_div supports bfloat16
+ if (np_op2 in [self._sqrt_n_reduce_op, self._mean_reduce_op] and
+ dtype == dtypes_lib.bfloat16):
+ continue
+ np_ans = self._segmentReduce(
+ indices, np_x, np_op1, np_op2, num_segments=num_segments,
+ initial_value=init_op(dtype))
+ s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
+ tf_ans = s.eval()
+ if dtype is dtypes_lib.bfloat16:
+ tf_ans = tf_ans.astype(np.float32)
+ self.assertAllClose(np_ans, tf_ans)
+ self.assertShapeEqual(np_ans, s)
def testNumSegmentsTypes(self):
dtypes = [dtypes_lib.int32, dtypes_lib.int64]
@@ -287,25 +320,51 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
self.assertAllClose(np_ans, tf_ans)
self.assertShapeEqual(np_ans, s)
- def testGradientSegmentSum(self):
+ def testGradients(self):
num_cols = 2
- indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+ indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
num_segments = max(indices_flat) + 3
- for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
- dtypes_lib.complex128]:
+ for dtype in self.differentiable_dtypes:
+ ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (num_cols,)
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtype)
- s = math_ops.unsorted_segment_sum(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
+ # test CPU and GPU as tf.gather behaves differently on each device
+ for use_gpu in [False, True]:
+ with self.test_session(use_gpu=use_gpu):
+ for _, _, tf_op, _ in ops_list:
+ tf_x, np_x = self._input(shape, dtype=dtype)
+ s = tf_op(tf_x, indices, num_segments)
+ jacob_t, jacob_n = gradient_checker.compute_gradient(
+ tf_x,
+ shape,
+ s, [num_segments, num_cols],
+ x_init_value=np_x,
+ delta=1)
+ self.assertAllClose(jacob_t, jacob_n)
+
+ def testProdGrad(self):
+ # additional test for the prod gradient to ensure correct handling of zeros
+ values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
+ indices = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=np.int32)
+ indices_neg = np.array([-1, 0, 0, -1, 1, 1, -1, 2, 2], dtype=np.int32)
+ values_tf = constant_op.constant(values)
+ # ground truth partial derivatives
+ gradients_indices = np.zeros((9, 3), dtype=np.float32)
+ gradients_indices_neg = np.zeros((9, 3), dtype=np.float32)
+ # the derivative w.r.t. the other segments is zero, so here we only
+ # explicitly set the grad values for the corresponding segment
+ gradients_indices[range(9), indices] = [0, 0, 0, 4, 0, 0, 9, 9, 9]
+ gradients_indices_neg[range(9), indices_neg] = [0, 1, 0, 0, 2, 2, 0, 3, 3]
+ for use_gpu in [False, True]:
+ with self.test_session(use_gpu=use_gpu):
+ for ind, grad_gt in [(indices, gradients_indices),
+ (indices_neg, gradients_indices_neg)]:
+ s = math_ops.unsorted_segment_prod(values_tf,
+ constant_op.constant(ind), 3)
jacob_t, jacob_n = gradient_checker.compute_gradient(
- tf_x,
- shape,
- s, [num_segments, num_cols],
- x_init_value=np_x,
- delta=1)
- self.assertAllClose(jacob_t, jacob_n)
+ values_tf, (9,), s, (3,), x_init_value=values, delta=1)
+ self.assertAllClose(jacob_t, jacob_n)
+ self.assertAllClose(jacob_t, grad_gt)
def testGradientMatchesSegmentSum(self):
# Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
@@ -318,8 +377,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
num_cols = 2
shape = [n, num_cols]
num_segments = max(indices) + 1
- for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
- dtypes_lib.complex128]:
+ for dtype in self.differentiable_dtypes:
with self.test_session(use_gpu=True):
tf_x, np_x = self._input(shape, dtype=dtype)
# Results from UnsortedSegmentSum
@@ -353,9 +411,8 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
unsorted.eval()
def testEmptySecondDimension(self):
- dtypes = [
- np.float32, np.float64, np.int64, np.int32, np.complex64, np.complex128
- ]
+ dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
+ np.complex64, np.complex128]
with self.test_session(use_gpu=True):
for dtype in dtypes:
for itype in (np.int32, np.int64):
@@ -364,36 +421,14 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
unsorted = math_ops.unsorted_segment_sum(data, segment_ids, 2)
self.assertAllEqual(unsorted.eval(), np.zeros((2, 0), dtype=dtype))
- def testGradientSegmentMax(self):
- num_cols = 2
- indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
- num_segments = max(indices_flat) + 3
- for indices in indices_flat, indices_flat.reshape(5, 2):
- shape = indices.shape + (num_cols,)
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
- s = math_ops.unsorted_segment_max(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
- jacob_t, jacob_n = gradient_checker.compute_gradient(
- tf_x,
- shape,
- s,
- [num_segments, num_cols],
- x_init_value=np_x.astype(np.double), delta=1)
- self.assertAllClose(jacob_t, jacob_n)
-
def testDropNegatives(self):
 # Note: the test works by replacing segment_ids of value 8 with -1
 # and replacing the corresponding numpy-generated values with 0.
- dtypes = [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
- dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
- ]
indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
num_segments = 12
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (2,)
- for dtype in dtypes:
+ for dtype in self.all_dtypes:
with self.test_session(use_gpu=True):
tf_x, np_x = self._input(shape, dtype=dtype)
np_ans = self._segmentReduce(
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 173d95b258..bbc040dc13 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -133,6 +133,39 @@ class UniqueWithCountsTest(test.TestCase):
v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)]
self.assertEqual(count, sum(v))
+ def testInt32Axis(self):
+ for dtype in [np.int32, np.int64]:
+ x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+ with self.test_session() as sess:
+ y0, idx0, count0 = gen_array_ops.unique_with_counts_v2(
+ x, axis=np.array([0], dtype))
+ tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0])
+ y1, idx1, count1 = gen_array_ops.unique_with_counts_v2(
+ x, axis=np.array([1], dtype))
+ tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1])
+ self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+ self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+ self.assertAllEqual(tf_count0, np.array([2, 1]))
+ self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+ self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+ self.assertAllEqual(tf_count1, np.array([1, 2]))
+
+ def testInt32V2(self):
+    # This test is only temporary; once V2 is used by default,
+    # the axis will be wrapped to allow `axis=None`.
+ x = np.random.randint(2, high=10, size=7000)
+ with self.test_session() as sess:
+ y, idx, count = gen_array_ops.unique_with_counts_v2(
+ x, axis=np.array([], np.int32))
+ tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+
+ self.assertEqual(len(x), len(tf_idx))
+ self.assertEqual(len(tf_y), len(np.unique(x)))
+ for i in range(len(x)):
+ self.assertEqual(x[i], tf_y[tf_idx[i]])
+ for value, count in zip(tf_y, tf_count):
+ self.assertEqual(count, np.sum(x == value))
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index b4e1b9d781..ec7c14f7d8 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1316,6 +1316,18 @@ def unique(x, out_idx=dtypes.int32, name=None):
unique.__doc__ = gen_array_ops.unique.__doc__
+@tf_export("unique_with_counts")
+def unique_with_counts(x, out_idx=dtypes.int32, name=None):
+  # TODO(yongtang): switch to v2 once the API deprecation
+  # period (3 weeks) passes.
+  # TODO(yongtang): The documentation should also
+  # be updated when switching to v2.
+ return gen_array_ops.unique_with_counts(x, out_idx, name)
+
+
+unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__
+
+
@tf_export("split")
def split(value, num_or_size_splits, axis=0, num=None, name="split"):
"""Splits a tensor into sub tensors.
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index 0a2af3716b..c4cfc0da19 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -70,10 +70,8 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
self.assertAllEqual(truth, popcnt_result)
def testInvertOp(self):
- dtype_list = [
- dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8,
- dtypes.uint16, dtypes.uint32, dtypes.uint64
- ]
+ dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+ dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
inputs = [0, 5, 3, 14]
with self.test_session(use_gpu=True) as sess:
for dtype in dtype_list:
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 7d6e047d7c..d6d75e4ef9 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -334,9 +334,9 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
@compatibility{eager} returns None
Raises:
- InvalidArgumentError if the check can be performed immediately and
- `x == y` is False. The check can be performed immediately during
- eager execution or if `x` and `y` are statically known.
+ InvalidArgumentError: if the check can be performed immediately and
+ `x == y` is False. The check can be performed immediately during eager
+ execution or if `x` and `y` are statically known.
"""
message = message or ''
with ops.name_scope(name, 'assert_equal', [x, y, data]):
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index e4ce2ab28a..b9a93c3bed 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -99,19 +99,16 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
name=None, weights=None):
"""Computes the confusion matrix from predictions and labels.
- Calculate the Confusion Matrix for a pair of prediction and
- label 1-D int arrays.
-
The matrix columns represent the prediction labels and the rows represent the
real labels. The confusion matrix is always a 2-D array of shape `[n, n]`,
where `n` is the number of valid labels for a given classification task. Both
prediction and labels must be 1-D arrays of the same shape in order for this
function to work.
- If `num_classes` is None, then `num_classes` will be set to the one plus
- the maximum value in either predictions or labels.
- Class labels are expected to start at 0. E.g., if `num_classes` was
- three, then the possible labels would be `[0, 1, 2]`.
+ If `num_classes` is `None`, then `num_classes` will be set to one plus the
+ maximum value in either predictions or labels. Class labels are expected to
+ start at 0. For example, if `num_classes` is 3, then the possible labels
+ would be `[0, 1, 2]`.
If `weights` is not `None`, then each prediction contributes its
corresponding weight to the total value of the confusion matrix cell.
@@ -141,8 +138,9 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
weights: An optional `Tensor` whose shape matches `predictions`.
Returns:
- A k X k matrix representing the confusion matrix, where k is the number of
- possible labels in the classification task.
+ A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
+ matrix, where `n` is the number of possible labels in the classification
+ task.
Raises:
ValueError: If both predictions and labels are not 1-D vectors and have
@@ -188,7 +186,7 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
weights = math_ops.cast(weights, dtype)
shape = array_ops.stack([num_classes, num_classes])
- indices = array_ops.transpose(array_ops.stack([labels, predictions]))
+ indices = array_ops.stack([labels, predictions], axis=1)
values = (array_ops.ones_like(predictions, dtype)
if weights is None else weights)
cm_sparse = sparse_tensor.SparseTensor(
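For 1-D inputs, `stack([labels, predictions], axis=1)` builds the `[n, 2]` index pairs directly, equivalent to the old transpose of a `[2, n]` stack. A small worked example of the resulting matrix (hypothetical values; `num_classes` is inferred as `max + 1 = 5`):

    import tensorflow as tf

    labels = tf.constant([1, 2, 4])
    predictions = tf.constant([2, 2, 4])
    cm = tf.confusion_matrix(labels, predictions)
    with tf.Session() as sess:
      print(sess.run(cm))
    # [[0 0 0 0 0]
    #  [0 0 1 0 0]    # true 1, predicted 2
    #  [0 0 1 0 0]    # true 2, predicted 2
    #  [0 0 0 0 0]
    #  [0 0 0 0 1]]   # true 4, predicted 4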
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index bed4cbb2c1..1d605c5dfc 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -213,7 +213,7 @@ def _ndtri(p):
# Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
# where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
- # arrays based on wether p < exp(-32).
+ # arrays based on whether p < exp(-32).
z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
first_term = z - math_ops.log(z) / z
second_term_small_p = (_create_polynomial(1. / z, p2)
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index f6ef6f3f3d..9b8172bf26 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -32,6 +32,8 @@ TileGrad # Exported through array_grad instead of array_ops.
ZerosLike # TODO(josh11b): Use this instead of the Python version.
Unique
UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
Unpack
# candidate_sampling_ops
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 1088135b46..702e47d28f 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -456,7 +456,6 @@ def _rot90_4D(images, k, name_scope):
def _rot180():
return array_ops.reverse_v2(images, [1, 2])
-
def _rot270():
return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]), [2])
@@ -469,7 +468,6 @@ def _rot90_4D(images, k, name_scope):
result.set_shape([shape[0], None, None, shape[3]])
return result
-
@tf_export('image.transpose_image')
def transpose_image(image):
"""Transpose image(s) by swapping the height and width dimension.
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index b99aac5be5..c437c12c27 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1173,6 +1173,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
op(p_wrong_rank)
+
def testRot90GroupOrder(self):
image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
with self.test_session(use_gpu=True):
@@ -1207,7 +1208,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
y_np = np.rot90(image, k=k, axes=(1, 2))
self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
-
class RandomFlipTest(test_util.TensorFlowTestCase):
def testRandomLeftRight(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index b3ec3d5b7c..e180e83026 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -67,7 +67,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
operator = LinearOperatorDiag(diag)
# Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible
- # since the batch dimensions, [2, 1], are brodcast to
+ # since the batch dimensions, [2, 1], are broadcast to
# operator.batch_shape = [2, 3].
y = tf.random_normal(shape=[2, 1, 4, 2])
x = operator.solve(y)
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 424fd09e09..0840760810 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -646,7 +646,7 @@ def sigmoid_cross_entropy(
Args:
multi_class_labels: `[batch_size, num_classes]` target integer labels in
- `(0, 1)`.
+ `{0, 1}`.
logits: Float `[batch_size, num_classes]` logits outputs of the network.
weights: Optional `Tensor` whose rank is either 0, or the same rank as
`labels`, and must be broadcastable to `labels` (i.e., all dimensions must
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index e2ee9e4fe4..d220fe3cce 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -242,56 +242,142 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
dim0), None, None, None)
-def _SegmentMinOrMaxGrad(op, grad, is_sorted):
- """Gradient for SegmentMin and (unsorted) SegmentMax.
-
- They share similar code.
- """
- zeros = array_ops.zeros(
- array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype)
-
+def _SegmentMinOrMaxGrad(op, grad):
+ """ Gradient for SegmentMin and SegmentMax. """
+ zeros = array_ops.zeros_like(op.inputs[0], dtype=op.inputs[0].dtype)
# Get the number of selected (minimum or maximum) elements in each segment.
gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
- if is_sorted:
- num_selected = math_ops.segment_sum(
- math_ops.cast(is_selected, grad.dtype), op.inputs[1])
- else:
- num_selected = math_ops.unsorted_segment_sum(
- math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
-
+ num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
+ op.inputs[1])
# Compute the gradient for each segment. The gradient for the ith segment is
# divided evenly among the selected elements in that segment.
weighted_grads = math_ops.div(grad, num_selected)
gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
-
- if is_sorted:
- return array_ops.where(is_selected, gathered_grads, zeros), None
- else:
- return array_ops.where(is_selected, gathered_grads, zeros), None, None
+ return array_ops.where(is_selected, gathered_grads, zeros), None
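A quick numeric check of the even-split rule above (a sketch, TF 1.x graph mode assumed): when the segment maximum is attained by several elements, each receives an equal share of that segment's gradient.

    import tensorflow as tf

    data = tf.constant([1., 3., 3., 2.])
    ids = tf.constant([0, 0, 0, 1])
    out = tf.segment_max(data, ids)                    # [3., 2.]
    grad = tf.gradients(tf.reduce_sum(out), data)[0]
    with tf.Session() as sess:
      print(sess.run(grad))                            # [0., 0.5, 0.5, 1.]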
@ops.RegisterGradient("SegmentMin")
def _SegmentMinGrad(op, grad):
"""Gradient for SegmentMin."""
- return _SegmentMinOrMaxGrad(op, grad, True)
+ return _SegmentMinOrMaxGrad(op, grad)
@ops.RegisterGradient("SegmentMax")
def _SegmentMaxGrad(op, grad):
"""Gradient for SegmentMax."""
- return _SegmentMinOrMaxGrad(op, grad, True)
+ return _SegmentMinOrMaxGrad(op, grad)
+
+
+def _GatherDropNegatives(params, ids, zero_clipped_indices=None,
+ is_positive=None):
+ """ Helper function for unsorted segment ops. Gathers params for
+ positive segment ids and gathers 0 for inputs with negative segment id.
+ Also returns the clipped indices and a boolean mask with the same shape
+ as ids where a positive id is masked as true. With this, the latter two
+ can be passed as arguments to this function to reuse them.
+ """
+ if zero_clipped_indices is None:
+ zero_clipped_indices = math_ops.maximum(ids, array_ops.zeros_like(ids))
+ gathered = array_ops.gather(params, zero_clipped_indices)
+ if is_positive is None:
+ is_positive = math_ops.greater_equal(ids, 0)
+ # tf.where(condition, x, y) requires condition to have the same shape as x
+ # and y.
+  # TODO(philjd): remove this if tf.where supports broadcasting (#9284)
+ for _ in range(gathered.shape.ndims - is_positive.shape.ndims):
+ is_positive = array_ops.expand_dims(is_positive, -1)
+ is_positive = (is_positive &
+ array_ops.ones_like(gathered, dtype=dtypes.bool))
+ # replace gathered params of negative indices with 0
+ zero_slice = array_ops.zeros_like(gathered)
+ return (array_ops.where(is_positive, gathered, zero_slice),
+ zero_clipped_indices, is_positive)
+
+
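A NumPy mirror of what the helper computes (hypothetical values, for illustration only):

    import numpy as np

    params = np.array([[1., 2.], [3., 4.]])
    ids = np.array([1, -1, 0])
    clipped = np.maximum(ids, 0)   # [1, 0, 0], the zero_clipped_indices
    gathered = params[clipped]     # gather with the clipped ids
    gathered[ids < 0] = 0.         # zero the rows whose original id was < 0
    print(gathered)                # [[3. 4.] [0. 0.] [1. 2.]]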
+def _UnsortedSegmentMinOrMaxGrad(op, grad):
+ """ Gradient for UnsortedSegmentMin and UnsortedSegmentMax. """
+ # Get the number of selected (minimum or maximum) elements in each segment.
+ gathered_outputs, zero_clipped_indices, is_positive = \
+ _GatherDropNegatives(op.outputs[0], op.inputs[1])
+ is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
+ is_selected = math_ops.logical_and(is_selected, is_positive)
+ num_selected = math_ops.unsorted_segment_sum(
+ math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
+ # Compute the gradient for each segment. The gradient for the ith segment is
+ # divided evenly among the selected elements in that segment.
+ weighted_grads = math_ops.div(grad, num_selected)
+ gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
+ zero_clipped_indices,
+ is_positive)
+ zeros = array_ops.zeros_like(gathered_grads)
+ return array_ops.where(is_selected, gathered_grads, zeros), None, None
@ops.RegisterGradient("UnsortedSegmentSum")
def _UnsortedSegmentSumGrad(op, grad):
- """Gradient for SegmentSum."""
- return array_ops.gather(grad, op.inputs[1]), None, None
+ """Gradient for UnsortedSegmentSum."""
+ return _GatherDropNegatives(grad, op.inputs[1])[0], None, None
@ops.RegisterGradient("UnsortedSegmentMax")
def _UnsortedSegmentMaxGrad(op, grad):
- return _SegmentMinOrMaxGrad(op, grad, False)
+ """ Gradient for UnsortedSegmentMax. """
+ return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentMin")
+def _UnsortedSegmentMinGrad(op, grad):
+ """ Gradient for UnsortedSegmentMin. """
+ return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentProd")
+def _UnsortedSegmentProdGrad(op, grad):
+ """ Gradient for UnsortedSegmentProd.
+ The gradient can be expressed for each segment by dividing the segment's
+ product by each element of the segment input tensor, but this approach can't
+ deal with zeros in the input.
+ Unlike reduce_prod we can't use cumsum here as individual segments may have
+ a different number of elements. Therefore we consider three cases:
+ 1) A segment input contains no zeros and we can safely divide by the input
+ tensor.
+ 2) A segment contains exactly one zero. Then the gradient of each input of
+ the segment is zero except for the 0-input, there the gradient is
+ the product of the remaining segment entries.
+ 3) A segment contains at least two zeros. The gradient is zero for all
+ segment inputs.
+ """
+ # Note that unsorted_segment_sum will filter out the negative indices,
+ # so we don't need to do a logical_and with is_positive here
+ is_zero = math_ops.equal(op.inputs[0], 0)
+ num_zeros = gen_math_ops.unsorted_segment_sum(
+ math_ops.cast(is_zero, dtype=dtypes.int32), op.inputs[1], op.inputs[2])
+  # Handle case 3: set the gradient to 0 for segments with more than one
+  # zero in the input.
+ grad = array_ops.where(math_ops.greater(num_zeros, 1),
+ array_ops.zeros_like(grad), grad)
+ # replace all zeros with ones and compute the unsorted_segment_prod
+ non_zero_data = array_ops.where(is_zero, array_ops.ones_like(op.inputs[0]),
+ op.inputs[0])
+ non_zero_prod = gen_math_ops.unsorted_segment_prod(
+ non_zero_data, op.inputs[1], op.inputs[2])
+ # clip the indices for gather to be positive
+ zero_clipped_indices = math_ops.maximum(op.inputs[1],
+ array_ops.zeros_like(op.inputs[1]))
+ gathered_prod = array_ops.gather(op.outputs[0], zero_clipped_indices)
+ gathered_non_zero_prod = array_ops.gather(non_zero_prod,
+ zero_clipped_indices)
+ prod_divided_by_el = gathered_prod / op.inputs[0] # May contain nan/inf.
+ # Now fetch the individual results for segments containing 0 and those that
+ # don't. is_zero will also fetch results for entries with negative index
+ # but the following gather_drop_negatives sets the corresponding entry in
+ # grad to 0 for these
+ partial_derivative = array_ops.where(is_zero, gathered_non_zero_prod,
+ prod_divided_by_el)
+ gathered_grad = _GatherDropNegatives(grad, op.inputs[1],
+ zero_clipped_indices)[0]
+ return gathered_grad * partial_derivative, None, None
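A worked instance of the three cases, reusing the values from testProdGrad above (TF 1.x graph mode assumed): segment 0 = [0, 0, 1] has two zeros, segment 1 = [0, 2, 2] exactly one, and segment 2 = [3, 3, 3] none.

    import tensorflow as tf

    values = tf.constant([0., 0., 1., 0., 2., 2., 3., 3., 3.])
    ids = tf.constant([0, 0, 0, 1, 1, 1, 2, 2, 2])
    prod = tf.unsorted_segment_prod(values, ids, 3)       # [0., 0., 27.]
    grad = tf.gradients(tf.reduce_sum(prod), values)[0]
    with tf.Session() as sess:
      print(sess.run(grad))  # [0. 0. 0. 4. 0. 0. 9. 9. 9.]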
@ops.RegisterGradient("Abs")
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 0063de52c7..0b3509360e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -129,6 +129,9 @@ See the @{$python/math_ops} guide.
@@segment_mean
@@unsorted_segment_sum
@@unsorted_segment_max
+@@unsorted_segment_min
+@@unsorted_segment_prod
+@@unsorted_segment_sqrt_n
@@sparse_segment_sum
@@sparse_segment_mean
@@sparse_segment_sqrt_n
@@ -898,6 +901,40 @@ def to_bfloat16(x, name="ToBFloat16"):
return cast(x, dtypes.bfloat16, name=name)
+@tf_export("to_complex64")
+def to_complex64(x, name="ToComplex64"):
+ """Casts a tensor to type `complex64`.
+
+ Args:
+ x: A `Tensor` or `SparseTensor`.
+ name: A name for the operation (optional).
+
+ Returns:
+    A `Tensor` or `SparseTensor` with the same shape as `x`, of type
+    `complex64`.
+
+  Raises:
+    TypeError: If `x` cannot be cast to `complex64`.
+ """
+ return cast(x, dtypes.complex64, name=name)
+
+
+@tf_export("to_complex128")
+def to_complex128(x, name="ToComplex128"):
+ """Casts a tensor to type `complex128`.
+
+ Args:
+ x: A `Tensor` or `SparseTensor`.
+ name: A name for the operation (optional).
+
+ Returns:
+    A `Tensor` or `SparseTensor` with the same shape as `x`, of type
+    `complex128`.
+
+  Raises:
+    TypeError: If `x` cannot be cast to `complex128`.
+ """
+ return cast(x, dtypes.complex128, name=name)
+
+
ops.Tensor._override_operator("__neg__", gen_math_ops.neg)
ops.Tensor._override_operator("__abs__", abs)
# __invert__ corresponds to the ~ operator. Here we follow the numpy convention
@@ -2559,6 +2596,87 @@ def reduced_shape(input_shape, axes):
]) # [1, 1]
+def _unsorted_segment_N(data, segment_ids, num_segments):
+ """ Helper function for unsorted_segment_mean/_sqrtN. Computes the number
+ of segment entries with 0-entries set to 1 to allow division by N.
+ """
+ # bincount doesn't support negative indices so we use unsorted_segment_sum
+ ones_tensor = array_ops.ones(segment_ids.shape, dtype=data.dtype)
+ N = gen_math_ops.unsorted_segment_sum(ones_tensor, segment_ids, num_segments)
+ # add dimensions for all non-reduced axes
+ ndims_output = data.shape.ndims - segment_ids.shape.ndims
+ broadcast_shape = [num_segments] + [1] * ndims_output
+ N = array_ops.reshape(N, broadcast_shape)
+ return gen_math_ops.maximum(N, 1)
+
+
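For intuition, the same computation via the public ops (a sketch): counts are accumulated with unsorted_segment_sum so negative ids are dropped, empty segments are clamped to 1, and the result is reshaped to broadcast against the summed data.

    import tensorflow as tf

    ids = tf.constant([0, 0, 1, -1])
    counts = tf.unsorted_segment_sum(tf.ones_like(ids, tf.float32), ids, 3)
    n = tf.reshape(tf.maximum(counts, 1.), [3, 1])
    with tf.Session() as sess:
      print(sess.run(n))  # [[2.] [1.] [1.]] -- the -1 entry was dropped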
+@tf_export("unsorted_segment_mean")
+def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
+ r""" Computes the mean along segments of a tensor.
+
+ Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+ of segments.
+
+ This operator is similar to the unsorted segment sum operator found
+ [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+ Instead of computing the sum over segments, it computes the mean of all
+ entries belonging to a segment such that:
+
+  \\(output_i = 1 / N_i \sum_j data_j\\) where the sum is over `j` such
+  that `segment_ids[j] == i`, with \\(N_i\\) being the number of
+  occurrences of id \\(i\\).
+
+ If there is no entry for a given segment ID `i`, it outputs 0.
+
+  segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
+
+  output: Has same shape as `data`, except for dimension 0 which
+  has size `num_segments`.
+ """
+ with ops.name_scope(name, "UnsortedSegmentMean"):
+ data = ops.convert_to_tensor(data)
+ segment_ids = ops.convert_to_tensor(segment_ids)
+ N = _unsorted_segment_N(data, segment_ids, num_segments)
+ summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+ return summed / N
+
+
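A minimal usage sketch (TF 1.x graph mode assumed):

    import tensorflow as tf

    data = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
    ids = tf.constant([0, 0, 1])
    mean = tf.unsorted_segment_mean(data, ids, num_segments=2)
    with tf.Session() as sess:
      print(sess.run(mean))  # [[2. 3.] [5. 6.]]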
+@tf_export("unsorted_segment_sqrt_n")
+def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
+ r"""Computes the sum along segments of a tensor divided by the sqrt(N).
+
+ Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+ of segments.
+
+ This operator is similar to the unsorted segment sum operator found
+ [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+  In addition to computing the sum over segments, it divides the results by
+  sqrt(N).
+
+  \\(output_i = 1 / \sqrt{N_i} \sum_j data_j\\) where the sum is over `j`
+  such that `segment_ids[j] == i`, with \\(N_i\\) being the number of
+  occurrences of id \\(i\\).
+
+ If there is no entry for a given segment ID `i`, it outputs 0.
+
+ Note that this op only supports floating point and complex dtypes,
+ due to tf.sqrt only supporting these types.
+
+  segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
+
+  output: Has same shape as `data`, except for dimension 0 which
+  has size `num_segments`.
+ """
+ with ops.name_scope(name, "UnsortedSegmentSqrtN"):
+ data = ops.convert_to_tensor(data)
+ segment_ids = ops.convert_to_tensor(segment_ids)
+ N = _unsorted_segment_N(data, segment_ids, num_segments)
+ summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+ return summed / gen_math_ops.sqrt(N)
+
+
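And the matching example for the sqrt(N) variant (a sketch):

    import tensorflow as tf

    data = tf.constant([[1., 1.], [1., 1.]])
    ids = tf.constant([0, 0])
    out = tf.unsorted_segment_sqrt_n(data, ids, num_segments=1)
    with tf.Session() as sess:
      print(sess.run(out))  # [[1.4142135 1.4142135]], i.e. 2 / sqrt(2)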
@tf_export("sparse_segment_sum")
def sparse_segment_sum(data, indices, segment_ids, name=None,
num_segments=None):
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 7814a27311..9d6f65dbbf 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -1343,4 +1343,4 @@ def sampled_softmax_loss(weights,
sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
labels=labels, logits=logits)
# sampled_losses is a [batch_size] tensor.
- return sampled_losses
+    return sampled_losses
\ No newline at end of file
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 33f6debbcb..b0e9e3e5ed 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -115,7 +115,7 @@ def _get_outputs_tensor_info_from_meta_graph_def(meta_graph_def,
signature_def_key).outputs
-def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
+def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=0):
"""Prints input and output TensorInfos.
Prints the details of input and output TensorInfos for the SignatureDef mapped
@@ -126,6 +126,7 @@ def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
tag_set: Group of tag(s) of the MetaGraphDef, in string format, separated by
','. For tag-set contains multiple tags, all tags must be passed in.
signature_def_key: A SignatureDef key string.
+ indent: How far (in increments of 2 spaces) to indent each line of output.
"""
meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir,
tag_set)
@@ -134,29 +135,39 @@ def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
meta_graph_def, signature_def_key)
- print('The given SavedModel SignatureDef contains the following input(s):')
+  indent_str = "  " * indent
+ def in_print(s):
+ print(indent_str + s)
+
+ in_print('The given SavedModel SignatureDef contains the following input(s):')
for input_key, input_tensor in sorted(inputs_tensor_info.items()):
- print('inputs[\'%s\'] tensor_info:' % input_key)
- _print_tensor_info(input_tensor)
+ in_print(' inputs[\'%s\'] tensor_info:' % input_key)
+ _print_tensor_info(input_tensor, indent+1)
- print('The given SavedModel SignatureDef contains the following output(s):')
+ in_print('The given SavedModel SignatureDef contains the following '
+ 'output(s):')
for output_key, output_tensor in sorted(outputs_tensor_info.items()):
- print('outputs[\'%s\'] tensor_info:' % output_key)
- _print_tensor_info(output_tensor)
+ in_print(' outputs[\'%s\'] tensor_info:' % output_key)
+ _print_tensor_info(output_tensor, indent+1)
- print('Method name is: %s' %
- meta_graph_def.signature_def[signature_def_key].method_name)
+ in_print('Method name is: %s' %
+ meta_graph_def.signature_def[signature_def_key].method_name)
-def _print_tensor_info(tensor_info):
+def _print_tensor_info(tensor_info, indent=0):
"""Prints details of the given tensor_info.
Args:
tensor_info: TensorInfo object to be printed.
+    indent: How far (in increments of 2 spaces) to indent each line of output.
"""
- print(' dtype: ' +
- {value: key
- for (key, value) in types_pb2.DataType.items()}[tensor_info.dtype])
+  indent_str = "  " * indent
+ def in_print(s):
+ print(indent_str + s)
+
+ in_print(' dtype: ' +
+ {value: key
+ for (key, value) in types_pb2.DataType.items()}[tensor_info.dtype])
# Display shape as tuple.
if tensor_info.tensor_shape.unknown_rank:
shape = 'unknown_rank'
@@ -164,8 +175,8 @@ def _print_tensor_info(tensor_info):
dims = [str(dim.size) for dim in tensor_info.tensor_shape.dim]
shape = ', '.join(dims)
shape = '(' + shape + ')'
- print(' shape: ' + shape)
- print(' name: ' + tensor_info.name)
+ in_print(' shape: ' + shape)
+ in_print(' name: ' + tensor_info.name)
def _show_all(saved_model_dir):
@@ -186,7 +197,8 @@ def _show_all(saved_model_dir):
signature_def_map = get_signature_def_map(saved_model_dir, tag_set)
for signature_def_key in sorted(signature_def_map.keys()):
print('\nsignature_def[\'' + signature_def_key + '\']:')
- _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key)
+ _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key,
+ indent=1)
def get_meta_graph_def(saved_model_dir, tag_set):
@@ -614,19 +626,19 @@ def create_parser():
show_msg = (
'Usage examples:\n'
'To show all tag-sets in a SavedModel:\n'
- '$saved_model_cli show --dir /tmp/saved_model\n'
+ '$saved_model_cli show --dir /tmp/saved_model\n\n'
'To show all available SignatureDef keys in a '
'MetaGraphDef specified by its tag-set:\n'
- '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n'
+ '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n\n'
'For a MetaGraphDef with multiple tags in the tag-set, all tags must be '
'passed in, separated by \';\':\n'
'$saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu\n\n'
'To show all inputs and outputs TensorInfo for a specific'
' SignatureDef specified by the SignatureDef key in a'
' MetaGraph.\n'
- '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
- '--signature_def serving_default\n\n'
- 'To show all available information in the SavedModel\n:'
+ '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+ ' --signature_def serving_default\n\n'
+ 'To show all available information in the SavedModel:\n'
'$saved_model_cli show --dir /tmp/saved_model --all')
parser_show = subparsers.add_parser(
'show',
@@ -658,12 +670,14 @@ def create_parser():
run_msg = ('Usage example:\n'
'To run input tensors from files through a MetaGraphDef and save'
' the output tensors to files:\n'
- '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
- '--signature_def serving_default '
- '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy '
- '--input_exprs \'input3_key=np.ones(2)\' --input_examples '
- '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' '
- '--outdir=/out\n\n'
+ '$saved_model_cli show --dir /tmp/saved_model --tag_set serve \\\n'
+ ' --signature_def serving_default \\\n'
+ ' --inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy '
+ '\\\n'
+ ' --input_exprs \'input3_key=np.ones(2)\' \\\n'
+ ' --input_examples '
+ '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' \\\n'
+ ' --outdir=/out\n\n'
'For more information about input file format, please see:\n'
'https://www.tensorflow.org/programmers_guide/saved_model_cli\n')
parser_run = subparsers.add_parser(
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index d6cbc49ba1..f99c844845 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -61,83 +61,84 @@ class SavedModelCLITestCase(test.TestCase):
exp_out = """MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['classify_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y3:0
-Method name is: tensorflow/serving/classify
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x2:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['scores'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y3:0
+ Method name is: tensorflow/serving/classify
signature_def['classify_x_to_y']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_STRING
- shape: unknown_rank
- name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y:0
-Method name is: tensorflow/serving/classify
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_STRING
+ shape: unknown_rank
+ name: tf_example:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['scores'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y:0
+ Method name is: tensorflow/serving/classify
signature_def['regress_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y3:0
-Method name is: tensorflow/serving/regress
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x2:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['outputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y3:0
+ Method name is: tensorflow/serving/regress
signature_def['regress_x_to_y']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_STRING
- shape: unknown_rank
- name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y:0
-Method name is: tensorflow/serving/regress
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_STRING
+ shape: unknown_rank
+ name: tf_example:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['outputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y:0
+ Method name is: tensorflow/serving/regress
signature_def['regress_x_to_y2']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
- dtype: DT_STRING
- shape: unknown_rank
- name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y2:0
-Method name is: tensorflow/serving/regress
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['inputs'] tensor_info:
+ dtype: DT_STRING
+ shape: unknown_rank
+ name: tf_example:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['outputs'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y2:0
+ Method name is: tensorflow/serving/regress
signature_def['serving_default']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: x:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
- dtype: DT_FLOAT
- shape: (-1, 1)
- name: y:0
-Method name is: tensorflow/serving/predict"""
+ The given SavedModel SignatureDef contains the following input(s):
+ inputs['x'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: x:0
+ The given SavedModel SignatureDef contains the following output(s):
+ outputs['y'] tensor_info:
+ dtype: DT_FLOAT
+ shape: (-1, 1)
+ name: y:0
+ Method name is: tensorflow/serving/predict"""
# pylint: enable=line-too-long
+ self.maxDiff = None # Produce a useful error msg if the comparison fails
self.assertMultiLineEqual(output, exp_out)
self.assertEqual(err.getvalue().strip(), '')
@@ -193,11 +194,11 @@ Method name is: tensorflow/serving/predict"""
output = out.getvalue().strip()
expected_output = (
'The given SavedModel SignatureDef contains the following input(s):\n'
- 'inputs[\'x\'] tensor_info:\n'
- ' dtype: DT_FLOAT\n shape: (-1, 1)\n name: x:0\n'
+ ' inputs[\'x\'] tensor_info:\n'
+ ' dtype: DT_FLOAT\n shape: (-1, 1)\n name: x:0\n'
'The given SavedModel SignatureDef contains the following output(s):\n'
- 'outputs[\'y\'] tensor_info:\n'
- ' dtype: DT_FLOAT\n shape: (-1, 1)\n name: y:0\n'
+ ' outputs[\'y\'] tensor_info:\n'
+ ' dtype: DT_FLOAT\n shape: (-1, 1)\n name: y:0\n'
'Method name is: tensorflow/serving/predict')
self.assertEqual(output, expected_output)
self.assertEqual(err.getvalue().strip(), '')
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 52d092bc22..e7f88de1d2 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -290,7 +290,11 @@ def _set_checkpoint_initializer(variable,
name: Name of the operation.
"""
base_type = variable.dtype.base_dtype
- with ops.colocate_with(variable.op):
+ # Do not colocate with variable since RestoreV2 op only runs on CPU and
+ # colocation will force variable (and other ops that colocate with variable)
+ # to be on CPU as well. It is okay to place the variable's initializer op on
+ # CPU since it will only be run once at the start.
+ with ops.device(variable.device), ops.device("/cpu:0"):
restore_op = io_ops.restore_v2(
ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
if isinstance(variable, resource_variable_ops.ResourceVariable):
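For intuition (a sketch of device-spec merging, consistent with the test update below): nested ops.device scopes merge partial specs, so a variable placed on a parameter-server job gets its restore op pinned to that job's CPU.

    # variable.device == "/job:ps"
    # with ops.device(variable.device), ops.device("/cpu:0"):
    #   ...  # ops created here land on "/job:ps/device:CPU:0"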
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 2bb95b80ff..4e08a1c859 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -207,7 +207,9 @@ class CheckpointsTest(test.TestCase):
checkpoint_utils.init_from_checkpoint(checkpoint_dir,
{"useful_scope/": "useful_scope/"})
- self.assertEqual(my4._initializer_op.op.inputs[1].device, "/job:ps")
+ # initializer runs on the same task but always on CPU.
+ self.assertEqual(my4._initializer_op.op.inputs[1].device,
+ "/job:ps/device:CPU:0")
def testInitFromRootCheckpoint(self):
checkpoint_dir = self.get_temp_dir()
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index bb95f34e01..123d67fd9b 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -2077,6 +2077,18 @@ tf_module {
argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
+ name: "unsorted_segment_min"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "unsorted_segment_prod"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "unsorted_segment_sqrt_n"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "unsorted_segment_sum"
argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index 5817716c8d..d4bf546d40 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -36,8 +36,13 @@ else
rm /this_is_writable_file_system
fi
+if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
+ ADDUSER_OPTS="--force-badname"
+fi
+
getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
-getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
+getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
+ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
--gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
--disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
usermod -a -G sudo "${CI_BUILD_USER}"
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index 1df6a84d7c..3e27a94cf2 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
# ==============================================================================
# Select bazel version.
-BAZEL_VERSION="0.10.0"
+BAZEL_VERSION="0.11.0"
set +e
local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index d16761c367..22c73c3fe1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -57,7 +57,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
-ENV BAZEL_VERSION 0.8.0
+ENV BAZEL_VERSION 0.11.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 4ef37881bc..69ba340f92 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -66,7 +66,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
-ENV BAZEL_VERSION 0.8.0
+ENV BAZEL_VERSION 0.11.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index fba39526b2..b7d7fac315 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -132,8 +132,8 @@ cc_library(
"//tensorflow/core:tensorflow",
"//tensorflow/contrib/rnn:gru_ops_op_lib",
"//tensorflow/contrib/rnn:lstm_ops_op_lib",
+ "//tensorflow/core/kernels:quantization_utils",
] + if_not_windows([
- "//tensorflow/core/kernels:quantized_ops",
"//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
"//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
]),
diff --git a/tensorflow/tools/graph_transforms/remove_control_dependencies.cc b/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
index a900ee65b0..cba6b78fc5 100644
--- a/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
+++ b/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
@@ -24,24 +24,23 @@ namespace graph_transforms {
// inputs which are referenced with "^tensor_name".
// See node_def.proto for more details.
Status RemoveControlDependencies(const GraphDef& input_graph_def,
- const TransformFuncContext& context,
- GraphDef* output_graph_def) {
- output_graph_def->Clear();
- for (const NodeDef& node : input_graph_def.node()) {
- NodeDef* new_node = output_graph_def->mutable_node()->Add();
- *new_node = node;
- new_node->clear_input();
- for (const auto& input : node.input()) {
- if (input[0] != '^') {
- new_node->add_input(input);
- }
+ const TransformFuncContext& context,
+ GraphDef* output_graph_def) {
+ output_graph_def->Clear();
+ for (const NodeDef& node : input_graph_def.node()) {
+ NodeDef* new_node = output_graph_def->mutable_node()->Add();
+ *new_node = node;
+ new_node->clear_input();
+ for (const auto& input : node.input()) {
+ if (input[0] != '^') {
+ new_node->add_input(input);
+ }
+ }
}
- }
- return Status::OK();
+ return Status::OK();
}
-REGISTER_GRAPH_TRANSFORM("remove_control_dependencies",
- RemoveControlDependencies);
+REGISTER_GRAPH_TRANSFORM("remove_control_dependencies", RemoveControlDependencies);
} // namespace graph_transforms
} // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 614457e899..3fbdb5cacd 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -27,6 +27,7 @@ pkg_tar(
":cheaders",
":clib",
":clicenses",
+ ":eager_cheaders",
],
)
@@ -57,7 +58,6 @@ pkg_tar(
name = "cheaders",
files = [
"//tensorflow/c:headers",
- "//tensorflow/c/eager:headers",
],
package_dir = "include/tensorflow/c",
# Mark as "manual" till
@@ -69,6 +69,20 @@ pkg_tar(
)
pkg_tar(
+ name = "eager_cheaders",
+ files = [
+ "//tensorflow/c/eager:headers",
+ ],
+ package_dir = "include/tensorflow/c/eager",
+ # Mark as "manual" till
+ # https://github.com/bazelbuild/bazel/issues/2352
+ # and https://github.com/bazelbuild/bazel/issues/1580
+ # are resolved, otherwise these rules break when built
+ # with Python 3.
+ tags = ["manual"],
+)
+
+pkg_tar(
name = "clib",
files = ["//tensorflow:libtensorflow.so"],
package_dir = "lib",