-rw-r--r--  README.md  37
-rwxr-xr-x  configure  3
-rw-r--r--  configure.py  70
-rw-r--r--  tensorflow/BUILD  16
-rw-r--r--  tensorflow/SECURITY.md  6
-rw-r--r--  tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc  4
-rw-r--r--  tensorflow/contrib/gan/python/eval/python/summaries_test.py  9
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py  15
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers_test.py  15
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt  1001
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java  8
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java  14
-rw-r--r--  tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java  9
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc  1
-rw-r--r--  tensorflow/contrib/lite/testing/generate_examples.py  1
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_ops.cc  1236
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_ops.py  163
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.cc  80
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.cu.cc  117
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.h  327
-rw-r--r--  tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py  34
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py  2
-rw-r--r--  tensorflow/contrib/slim/python/slim/data/parallel_reader.py  2
-rw-r--r--  tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h  2
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc  14
-rw-r--r--  tensorflow/contrib/timeseries/python/timeseries/BUILD  2
-rw-r--r--  tensorflow/contrib/timeseries/python/timeseries/head.py  59
-rw-r--r--  tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py  187
-rw-r--r--  tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py  145
-rw-r--r--  tensorflow/contrib/verbs/rdma.cc  1
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt  85
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt  13
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt  33
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt  32
-rw-r--r--  tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt  4
-rw-r--r--  tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt  4
-rw-r--r--  tensorflow/core/distributed_runtime/session_mgr.cc  4
-rw-r--r--  tensorflow/core/framework/numeric_types.h  42
-rw-r--r--  tensorflow/core/framework/variant_op_registry.h  4
-rw-r--r--  tensorflow/core/kernels/cwise_op_maximum.cc  4
-rw-r--r--  tensorflow/core/kernels/reshape_op.cc  1
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.cc  305
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.h  124
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc  143
-rw-r--r--  tensorflow/core/kernels/unique_op.cc  10
-rw-r--r--  tensorflow/core/kernels/unravel_index_op.cc  2
-rw-r--r--  tensorflow/core/ops/array_ops.cc  17
-rw-r--r--  tensorflow/core/ops/math_ops.cc  20
-rw-r--r--  tensorflow/core/platform/s3/s3_file_system.cc  1
-rw-r--r--  tensorflow/core/platform/vmodule_benchmark_test.cc  28
-rw-r--r--  tensorflow/core/platform/vmodule_test.cc  117
-rw-r--r--  tensorflow/core/util/cuda_device_functions.h  148
-rw-r--r--  tensorflow/core/util/cuda_kernel_helper.h  54
-rw-r--r--  tensorflow/docs_src/performance/xla/operation_semantics.md  22
-rw-r--r--  tensorflow/docs_src/programmers_guide/variables.md  5
-rw-r--r--  tensorflow/docs_src/tutorials/deep_cnn.md  14
-rw-r--r--  tensorflow/docs_src/tutorials/word2vec.md  10
-rw-r--r--  tensorflow/examples/speech_commands/train.py  6
-rw-r--r--  tensorflow/java/BUILD  3
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/Input.java  48
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFString.java  27
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFType.java  20
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java  30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/Types.java  52
-rw-r--r--  tensorflow/python/framework/test_util.py  3
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/lstm_test.py  4
-rw-r--r--  tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py  2
-rw-r--r--  tensorflow/python/kernel_tests/segment_reduction_ops_test.py  165
-rw-r--r--  tensorflow/python/kernel_tests/unique_op_test.py  33
-rw-r--r--  tensorflow/python/ops/array_ops.py  12
-rw-r--r--  tensorflow/python/ops/bitwise_ops_test.py  6
-rw-r--r--  tensorflow/python/ops/check_ops.py  6
-rw-r--r--  tensorflow/python/ops/confusion_matrix.py  18
-rw-r--r--  tensorflow/python/ops/control_flow_ops.py  2
-rw-r--r--  tensorflow/python/ops/distributions/special_math.py  2
-rw-r--r--  tensorflow/python/ops/hidden_ops.txt  2
-rw-r--r--  tensorflow/python/ops/image_ops_impl.py  2
-rw-r--r--  tensorflow/python/ops/image_ops_test.py  2
-rw-r--r--  tensorflow/python/ops/linalg/linear_operator_diag.py  2
-rw-r--r--  tensorflow/python/ops/losses/losses_impl.py  2
-rw-r--r--  tensorflow/python/ops/math_grad.py  136
-rw-r--r--  tensorflow/python/ops/math_ops.py  118
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.pbtxt  12
-rwxr-xr-x  tensorflow/tools/ci_build/builds/with_the_same_user  7
-rw-r--r--  tensorflow/tools/compatibility/ast_edits.py  502
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7  115
-rw-r--r--  tensorflow/tools/graph_transforms/remove_control_dependencies.cc  29
-rw-r--r--  tensorflow/tools/lib_package/BUILD  16
-rw-r--r--  tensorflow/workspace.bzl  1
-rw-r--r--  third_party/boringssl/add_boringssl_s390x.patch  133
-rw-r--r--  third_party/nanopb.BUILD  23
96 files changed, 1380 insertions, 5137 deletions
diff --git a/README.md b/README.md
index 916e5200b2..ef5bdc66ef 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,10 @@
-----------------
-| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
-|-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
+
+| **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
+|-----------------|---------------------|------------------|-------------------|---------------|---------------|
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
**TensorFlow** is an open source software library for numerical computation using
data flow graphs. The graph nodes represent mathematical operations, while
@@ -21,20 +22,6 @@ organization for the purposes of conducting machine learning and deep neural
networks research. The system is general enough to be applicable in a wide
variety of other domains, as well.
-**If you want to contribute to TensorFlow, be sure to review the [contribution
-guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
-[code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to
-uphold this code.**
-
-**We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs. So please see
-[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
-and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
-
-The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
-
-[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
-
## Installation
*See [Installing TensorFlow](https://www.tensorflow.org/get_started/os_setup.html) for instructions on how to install our release binaries or how to build from source.*
@@ -75,6 +62,22 @@ $ python
>>> sess.close()
```
+## Contribution guidelines
+
+**If you want to contribute to TensorFlow, be sure to review the [contribution
+guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
+[code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to
+uphold this code.**
+
+**We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
+tracking requests and bugs. So please see
+[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
+and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
+
+The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
+
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
+
## For more information
* [TensorFlow Website](https://www.tensorflow.org)
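
The hunk above ends in the context lines of the README's first-program shell session ("$ python" through ">>> sess.close()"). For reference, a minimal self-contained script in the same TF 1.x style (a sketch, not the README's exact snippet) is:

import tensorflow as tf

# Build a trivial graph and evaluate it in a TF 1.x session.
hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))  # b'Hello, TensorFlow!'
sess.close()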
diff --git a/configure b/configure
index 9c21d2b03a..66b66ba54e 100755
--- a/configure
+++ b/configure
@@ -8,7 +8,8 @@ if [ -z "$PYTHON_BIN_PATH" ]; then
fi
# Set all env variables
-"$PYTHON_BIN_PATH" configure.py
+CONFIGURE_DIR=$(dirname "$0")
+"$PYTHON_BIN_PATH" "${CONFIGURE_DIR}/configure.py" "$@"
echo "Configuration finished"
diff --git a/configure.py b/configure.py
index 9744f6ac81..7d2e30cd8a 100644
--- a/configure.py
+++ b/configure.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import argparse
import errno
import os
import platform
@@ -32,10 +33,6 @@ except ImportError:
from distutils.spawn import find_executable as which
# pylint: enable=g-import-not-at-top
-_TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
- '.tf_configure.bazelrc')
-_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
- 'WORKSPACE')
_DEFAULT_CUDA_VERSION = '9.0'
_DEFAULT_CUDNN_VERSION = '7'
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
@@ -51,6 +48,11 @@ _SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15]
_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
+_TF_WORKSPACE_ROOT = os.path.abspath(os.path.dirname(__file__))
+_TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
+_TF_BAZELRC = os.path.join(_TF_WORKSPACE_ROOT, _TF_BAZELRC_FILENAME)
+_TF_WORKSPACE = os.path.join(_TF_WORKSPACE_ROOT, 'WORKSPACE')
+
class UserInputError(Exception):
pass
@@ -119,22 +121,6 @@ def sed_in_place(filename, old, new):
f.write(newdata)
-def remove_line_with(filename, token):
- """Remove lines that contain token from file.
-
- Args:
- filename: string for filename.
- token: string token to check if to remove a line from file or not.
- """
- with open(filename, 'r') as f:
- filedata = f.read()
-
- with open(filename, 'w') as f:
- for line in filedata.strip().split('\n'):
- if token not in line:
- f.write(line + '\n')
-
-
def write_to_bazelrc(line):
with open(_TF_BAZELRC, 'a') as f:
f.write(line + '\n')
@@ -245,25 +231,26 @@ def setup_python(environ_cp):
environ_cp['PYTHON_BIN_PATH'] = python_bin_path
# Write tools/python_bin_path.sh
- with open('tools/python_bin_path.sh', 'w') as f:
+ with open(os.path.join(
+ _TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f:
f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path)
-def reset_tf_configure_bazelrc():
+def reset_tf_configure_bazelrc(workspace_path):
"""Reset file that contains customized config settings."""
open(_TF_BAZELRC, 'w').close()
+ bazelrc_path = os.path.join(workspace_path, '.bazelrc')
- home = os.path.expanduser('~')
- if not os.path.exists('.bazelrc'):
- if os.path.exists(os.path.join(home, '.bazelrc')):
- with open('.bazelrc', 'a') as f:
- f.write('import %s/.bazelrc\n' % home.replace('\\', '/'))
- else:
- open('.bazelrc', 'w').close()
-
- remove_line_with('.bazelrc', 'tf_configure')
- with open('.bazelrc', 'a') as f:
- f.write('import %workspace%/.tf_configure.bazelrc\n')
+ data = []
+ if os.path.exists(bazelrc_path):
+ with open(bazelrc_path, 'r') as f:
+ data = f.read().splitlines()
+ with open(bazelrc_path, 'w') as f:
+ for l in data:
+ if _TF_BAZELRC_FILENAME in l:
+ continue
+ f.write('%s\n' % l)
+ f.write('import %s\n' % _TF_BAZELRC)
def cleanup_makefile():
@@ -271,7 +258,8 @@ def cleanup_makefile():
These files could interfere with Bazel parsing.
"""
- makefile_download_dir = 'tensorflow/contrib/makefile/downloads'
+ makefile_download_dir = os.path.join(
+ _TF_WORKSPACE_ROOT, 'tensorflow', 'contrib', 'makefile', 'downloads')
if os.path.isdir(makefile_download_dir):
for root, _, filenames in os.walk(makefile_download_dir):
for f in filenames:
@@ -502,7 +490,8 @@ def set_cc_opt_flags(environ_cp):
for opt in cc_opt_flags.split():
write_to_bazelrc('build:opt --copt=%s' % opt)
# It should be safe on the same build host.
- write_to_bazelrc('build:opt --host_copt=-march=native')
+ if not is_ppc64le():
+ write_to_bazelrc('build:opt --host_copt=-march=native')
write_to_bazelrc('build:opt --define with_default_optimizations=true')
# TODO(mikecase): Remove these default defines once we are able to get
# TF Lite targets building without them.
@@ -1229,7 +1218,7 @@ def set_host_c_compiler(environ_cp):
environ_cp,
var_name='HOST_C_COMPILER',
var_default=default_c_host_compiler,
- ask_for_var=('Please specify which C compiler should be used as the host'
+ ask_for_var=('Please specify which C compiler should be used as the host '
'C compiler.'),
check_success=os.path.exists,
error_msg='Invalid C compiler path. %s cannot be found.',
@@ -1373,13 +1362,20 @@ def config_info_line(name, help_text):
def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--workspace",
+ type=str,
+ default=_TF_WORKSPACE_ROOT,
+ help="The absolute path to your active Bazel workspace.")
+ args = parser.parse_args()
+
# Make a copy of os.environ to be clear when functions and getting and setting
# environment variables.
environ_cp = dict(os.environ)
check_bazel_version('0.5.4')
- reset_tf_configure_bazelrc()
+ reset_tf_configure_bazelrc(args.workspace)
cleanup_makefile()
setup_python(environ_cp)
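
Taken together, the configure changes above make the wrapper script forward its arguments (e.g. the new --workspace flag) to configure.py, and reset_tf_configure_bazelrc now rewrites the workspace's .bazelrc by dropping any stale line that mentions .tf_configure.bazelrc and appending an import of the generated file by absolute path. A minimal standalone sketch of that rewrite pattern, using a throwaway temporary directory rather than the paths configure.py actually computes:

import os
import tempfile

_TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'


def rewrite_bazelrc(workspace_path, tf_bazelrc_path):
  """Drop stale imports of the generated file from .bazelrc, then re-add one."""
  bazelrc_path = os.path.join(workspace_path, '.bazelrc')
  lines = []
  if os.path.exists(bazelrc_path):
    with open(bazelrc_path, 'r') as f:
      lines = f.read().splitlines()
  with open(bazelrc_path, 'w') as f:
    for line in lines:
      if _TF_BAZELRC_FILENAME in line:  # skip any stale import line
        continue
      f.write('%s\n' % line)
    f.write('import %s\n' % tf_bazelrc_path)  # import by absolute path


# Hypothetical usage: exercise the pattern in a temporary workspace.
workspace = tempfile.mkdtemp()
rewrite_bazelrc(workspace, os.path.join(workspace, _TF_BAZELRC_FILENAME))
with open(os.path.join(workspace, '.bazelrc')) as f:
  print(f.read())  # prints: import <tmpdir>/.tf_configure.bazelrc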
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 2e71783b0d..dc995d231d 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -407,6 +407,14 @@ filegroup(
visibility = ["//tensorflow:__subpackages__"],
)
+py_library(
+ name = "tensorflow_py",
+ srcs = ["__init__.py"],
+ srcs_version = "PY2AND3",
+ visibility = ["//visibility:public"],
+ deps = ["//tensorflow/python"],
+)
+
filegroup(
name = "all_opensource_files",
data = [
@@ -821,11 +829,3 @@ exports_files(
"tf_exported_symbols.lds",
],
)
-
-py_library(
- name = "tensorflow_py",
- srcs = ["__init__.py"],
- srcs_version = "PY2AND3",
- visibility = ["//visibility:public"],
- deps = ["//tensorflow/python"],
-)
diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md
index 6ddac1f964..fea24b2739 100644
--- a/tensorflow/SECURITY.md
+++ b/tensorflow/SECURITY.md
@@ -233,7 +233,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
### Known vulnerabilities
-| Type | Versions affected | Reported by | Additional Information |
-|------|:-----------------:|---------------------------------------|
-| out of bounds read| <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+| Type | Versions affected | Reported by | Additional Information |
+|-------------------|:-----------------:|--------------------|-----------------------------|
+| out of bounds read| <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
index a50461cafd..beb574061b 100644
--- a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
@@ -17,4 +17,6 @@ limitations under the License.
#define __CUDACC__
#include "crt/host_config.h"
-int main(void) { return 0; }
+int main(void) {
+ return 0;
+}
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 5549df971d..45eb108586 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -71,10 +71,11 @@ def get_cyclegan_model():
class SummariesTest(test.TestCase):
- def _test_add_gan_model_image_summaries_impl(
- self, get_model_fn, expected_num_summary_ops, model_summaries):
- summaries.add_gan_model_image_summaries(
- get_model_fn(), grid_size=2, model_summaries=model_summaries)
+ def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
+ expected_num_summary_ops,
+ model_summaries):
+ summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
+ model_summaries=model_summaries)
self.assertEquals(expected_num_summary_ops,
len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index b2ea75c7e1..80cbe68870 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -58,12 +58,12 @@ __all__ = [
'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
- 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', 'dropout',
- 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'images_to_sequence',
- 'layer_norm', 'linear', 'pool', 'max_pool2d', 'max_pool3d',
- 'one_hot_encoding', 'relu', 'relu6', 'repeat', 'scale_gradient',
- 'separable_conv2d', 'separable_convolution2d', 'sequence_to_images',
- 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
+ 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
+ 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn',
+ 'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
+ 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
+ 'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
+ 'sequence_to_images', 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
'legacy_fully_connected', 'legacy_linear', 'legacy_relu', 'maxout'
]
@@ -2718,7 +2718,8 @@ def sequence_to_images(inputs,
num_batches = -1
else:
num_batches = num_batches // height
- reshaped = array_ops.reshape(inputs, [width, num_batches, height, depth])
+ reshaped = array_ops.reshape(inputs,
+ [width, num_batches, height, depth])
if output_data_format == 'channels_first':
outputs = array_ops.transpose(reshaped, [1, 3, 2, 0])
else:
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index ba70432c48..997f910a2a 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -3447,8 +3447,9 @@ class SequenceToImagesTest(test.TestCase):
num_time_steps = 11
num_channels = 5
desired_height = 7
- sequence = np.random.uniform(
- size=(num_time_steps, num_batches, num_channels)).astype(np.float32)
+ sequence = np.random.uniform(size=(num_time_steps,
+ num_batches,
+ num_channels)).astype(np.float32)
output = _layers.sequence_to_images(sequence, desired_height)
self.assertListEqual(output.get_shape().as_list(), [2, 7, 11, 5])
@@ -3457,10 +3458,12 @@ class SequenceToImagesTest(test.TestCase):
num_time_steps = 11
num_channels = 5
desired_height = 7
- sequence = np.random.uniform(
- size=(num_time_steps, num_batches, num_channels)).astype(np.float32)
- output = _layers.sequence_to_images(
- sequence, desired_height, output_data_format='channels_first')
+ sequence = np.random.uniform(size=(num_time_steps,
+ num_batches,
+ num_channels)).astype(np.float32)
+ output = _layers.sequence_to_images(sequence,
+ desired_height,
+ output_data_format='channels_first')
self.assertListEqual(output.get_shape().as_list(), [2, 5, 7, 11])
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt b/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt
deleted file mode 100644
index fe811239d8..0000000000
--- a/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt
+++ /dev/null
@@ -1,1001 +0,0 @@
-background
-tench
-goldfish
-great white shark
-tiger shark
-hammerhead
-electric ray
-stingray
-cock
-hen
-ostrich
-brambling
-goldfinch
-house finch
-junco
-indigo bunting
-robin
-bulbul
-jay
-magpie
-chickadee
-water ouzel
-kite
-bald eagle
-vulture
-great grey owl
-European fire salamander
-common newt
-eft
-spotted salamander
-axolotl
-bullfrog
-tree frog
-tailed frog
-loggerhead
-leatherback turtle
-mud turtle
-terrapin
-box turtle
-banded gecko
-common iguana
-American chameleon
-whiptail
-agama
-frilled lizard
-alligator lizard
-Gila monster
-green lizard
-African chameleon
-Komodo dragon
-African crocodile
-American alligator
-triceratops
-thunder snake
-ringneck snake
-hognose snake
-green snake
-king snake
-garter snake
-water snake
-vine snake
-night snake
-boa constrictor
-rock python
-Indian cobra
-green mamba
-sea snake
-horned viper
-diamondback
-sidewinder
-trilobite
-harvestman
-scorpion
-black and gold garden spider
-barn spider
-garden spider
-black widow
-tarantula
-wolf spider
-tick
-centipede
-black grouse
-ptarmigan
-ruffed grouse
-prairie chicken
-peacock
-quail
-partridge
-African grey
-macaw
-sulphur-crested cockatoo
-lorikeet
-coucal
-bee eater
-hornbill
-hummingbird
-jacamar
-toucan
-drake
-red-breasted merganser
-goose
-black swan
-tusker
-echidna
-platypus
-wallaby
-koala
-wombat
-jellyfish
-sea anemone
-brain coral
-flatworm
-nematode
-conch
-snail
-slug
-sea slug
-chiton
-chambered nautilus
-Dungeness crab
-rock crab
-fiddler crab
-king crab
-American lobster
-spiny lobster
-crayfish
-hermit crab
-isopod
-white stork
-black stork
-spoonbill
-flamingo
-little blue heron
-American egret
-bittern
-crane
-limpkin
-European gallinule
-American coot
-bustard
-ruddy turnstone
-red-backed sandpiper
-redshank
-dowitcher
-oystercatcher
-pelican
-king penguin
-albatross
-grey whale
-killer whale
-dugong
-sea lion
-Chihuahua
-Japanese spaniel
-Maltese dog
-Pekinese
-Shih-Tzu
-Blenheim spaniel
-papillon
-toy terrier
-Rhodesian ridgeback
-Afghan hound
-basset
-beagle
-bloodhound
-bluetick
-black-and-tan coonhound
-Walker hound
-English foxhound
-redbone
-borzoi
-Irish wolfhound
-Italian greyhound
-whippet
-Ibizan hound
-Norwegian elkhound
-otterhound
-Saluki
-Scottish deerhound
-Weimaraner
-Staffordshire bullterrier
-American Staffordshire terrier
-Bedlington terrier
-Border terrier
-Kerry blue terrier
-Irish terrier
-Norfolk terrier
-Norwich terrier
-Yorkshire terrier
-wire-haired fox terrier
-Lakeland terrier
-Sealyham terrier
-Airedale
-cairn
-Australian terrier
-Dandie Dinmont
-Boston bull
-miniature schnauzer
-giant schnauzer
-standard schnauzer
-Scotch terrier
-Tibetan terrier
-silky terrier
-soft-coated wheaten terrier
-West Highland white terrier
-Lhasa
-flat-coated retriever
-curly-coated retriever
-golden retriever
-Labrador retriever
-Chesapeake Bay retriever
-German short-haired pointer
-vizsla
-English setter
-Irish setter
-Gordon setter
-Brittany spaniel
-clumber
-English springer
-Welsh springer spaniel
-cocker spaniel
-Sussex spaniel
-Irish water spaniel
-kuvasz
-schipperke
-groenendael
-malinois
-briard
-kelpie
-komondor
-Old English sheepdog
-Shetland sheepdog
-collie
-Border collie
-Bouvier des Flandres
-Rottweiler
-German shepherd
-Doberman
-miniature pinscher
-Greater Swiss Mountain dog
-Bernese mountain dog
-Appenzeller
-EntleBucher
-boxer
-bull mastiff
-Tibetan mastiff
-French bulldog
-Great Dane
-Saint Bernard
-Eskimo dog
-malamute
-Siberian husky
-dalmatian
-affenpinscher
-basenji
-pug
-Leonberg
-Newfoundland
-Great Pyrenees
-Samoyed
-Pomeranian
-chow
-keeshond
-Brabancon griffon
-Pembroke
-Cardigan
-toy poodle
-miniature poodle
-standard poodle
-Mexican hairless
-timber wolf
-white wolf
-red wolf
-coyote
-dingo
-dhole
-African hunting dog
-hyena
-red fox
-kit fox
-Arctic fox
-grey fox
-tabby
-tiger cat
-Persian cat
-Siamese cat
-Egyptian cat
-cougar
-lynx
-leopard
-snow leopard
-jaguar
-lion
-tiger
-cheetah
-brown bear
-American black bear
-ice bear
-sloth bear
-mongoose
-meerkat
-tiger beetle
-ladybug
-ground beetle
-long-horned beetle
-leaf beetle
-dung beetle
-rhinoceros beetle
-weevil
-fly
-bee
-ant
-grasshopper
-cricket
-walking stick
-cockroach
-mantis
-cicada
-leafhopper
-lacewing
-dragonfly
-damselfly
-admiral
-ringlet
-monarch
-cabbage butterfly
-sulphur butterfly
-lycaenid
-starfish
-sea urchin
-sea cucumber
-wood rabbit
-hare
-Angora
-hamster
-porcupine
-fox squirrel
-marmot
-beaver
-guinea pig
-sorrel
-zebra
-hog
-wild boar
-warthog
-hippopotamus
-ox
-water buffalo
-bison
-ram
-bighorn
-ibex
-hartebeest
-impala
-gazelle
-Arabian camel
-llama
-weasel
-mink
-polecat
-black-footed ferret
-otter
-skunk
-badger
-armadillo
-three-toed sloth
-orangutan
-gorilla
-chimpanzee
-gibbon
-siamang
-guenon
-patas
-baboon
-macaque
-langur
-colobus
-proboscis monkey
-marmoset
-capuchin
-howler monkey
-titi
-spider monkey
-squirrel monkey
-Madagascar cat
-indri
-Indian elephant
-African elephant
-lesser panda
-giant panda
-barracouta
-eel
-coho
-rock beauty
-anemone fish
-sturgeon
-gar
-lionfish
-puffer
-abacus
-abaya
-academic gown
-accordion
-acoustic guitar
-aircraft carrier
-airliner
-airship
-altar
-ambulance
-amphibian
-analog clock
-apiary
-apron
-ashcan
-assault rifle
-backpack
-bakery
-balance beam
-balloon
-ballpoint
-Band Aid
-banjo
-bannister
-barbell
-barber chair
-barbershop
-barn
-barometer
-barrel
-barrow
-baseball
-basketball
-bassinet
-bassoon
-bathing cap
-bath towel
-bathtub
-beach wagon
-beacon
-beaker
-bearskin
-beer bottle
-beer glass
-bell cote
-bib
-bicycle-built-for-two
-bikini
-binder
-binoculars
-birdhouse
-boathouse
-bobsled
-bolo tie
-bonnet
-bookcase
-bookshop
-bottlecap
-bow
-bow tie
-brass
-brassiere
-breakwater
-breastplate
-broom
-bucket
-buckle
-bulletproof vest
-bullet train
-butcher shop
-cab
-caldron
-candle
-cannon
-canoe
-can opener
-cardigan
-car mirror
-carousel
-carpenter's kit
-carton
-car wheel
-cash machine
-cassette
-cassette player
-castle
-catamaran
-CD player
-cello
-cellular telephone
-chain
-chainlink fence
-chain mail
-chain saw
-chest
-chiffonier
-chime
-china cabinet
-Christmas stocking
-church
-cinema
-cleaver
-cliff dwelling
-cloak
-clog
-cocktail shaker
-coffee mug
-coffeepot
-coil
-combination lock
-computer keyboard
-confectionery
-container ship
-convertible
-corkscrew
-cornet
-cowboy boot
-cowboy hat
-cradle
-crane
-crash helmet
-crate
-crib
-Crock Pot
-croquet ball
-crutch
-cuirass
-dam
-desk
-desktop computer
-dial telephone
-diaper
-digital clock
-digital watch
-dining table
-dishrag
-dishwasher
-disk brake
-dock
-dogsled
-dome
-doormat
-drilling platform
-drum
-drumstick
-dumbbell
-Dutch oven
-electric fan
-electric guitar
-electric locomotive
-entertainment center
-envelope
-espresso maker
-face powder
-feather boa
-file
-fireboat
-fire engine
-fire screen
-flagpole
-flute
-folding chair
-football helmet
-forklift
-fountain
-fountain pen
-four-poster
-freight car
-French horn
-frying pan
-fur coat
-garbage truck
-gasmask
-gas pump
-goblet
-go-kart
-golf ball
-golfcart
-gondola
-gong
-gown
-grand piano
-greenhouse
-grille
-grocery store
-guillotine
-hair slide
-hair spray
-half track
-hammer
-hamper
-hand blower
-hand-held computer
-handkerchief
-hard disc
-harmonica
-harp
-harvester
-hatchet
-holster
-home theater
-honeycomb
-hook
-hoopskirt
-horizontal bar
-horse cart
-hourglass
-iPod
-iron
-jack-o'-lantern
-jean
-jeep
-jersey
-jigsaw puzzle
-jinrikisha
-joystick
-kimono
-knee pad
-knot
-lab coat
-ladle
-lampshade
-laptop
-lawn mower
-lens cap
-letter opener
-library
-lifeboat
-lighter
-limousine
-liner
-lipstick
-Loafer
-lotion
-loudspeaker
-loupe
-lumbermill
-magnetic compass
-mailbag
-mailbox
-maillot
-maillot
-manhole cover
-maraca
-marimba
-mask
-matchstick
-maypole
-maze
-measuring cup
-medicine chest
-megalith
-microphone
-microwave
-military uniform
-milk can
-minibus
-miniskirt
-minivan
-missile
-mitten
-mixing bowl
-mobile home
-Model T
-modem
-monastery
-monitor
-moped
-mortar
-mortarboard
-mosque
-mosquito net
-motor scooter
-mountain bike
-mountain tent
-mouse
-mousetrap
-moving van
-muzzle
-nail
-neck brace
-necklace
-nipple
-notebook
-obelisk
-oboe
-ocarina
-odometer
-oil filter
-organ
-oscilloscope
-overskirt
-oxcart
-oxygen mask
-packet
-paddle
-paddlewheel
-padlock
-paintbrush
-pajama
-palace
-panpipe
-paper towel
-parachute
-parallel bars
-park bench
-parking meter
-passenger car
-patio
-pay-phone
-pedestal
-pencil box
-pencil sharpener
-perfume
-Petri dish
-photocopier
-pick
-pickelhaube
-picket fence
-pickup
-pier
-piggy bank
-pill bottle
-pillow
-ping-pong ball
-pinwheel
-pirate
-pitcher
-plane
-planetarium
-plastic bag
-plate rack
-plow
-plunger
-Polaroid camera
-pole
-police van
-poncho
-pool table
-pop bottle
-pot
-potter's wheel
-power drill
-prayer rug
-printer
-prison
-projectile
-projector
-puck
-punching bag
-purse
-quill
-quilt
-racer
-racket
-radiator
-radio
-radio telescope
-rain barrel
-recreational vehicle
-reel
-reflex camera
-refrigerator
-remote control
-restaurant
-revolver
-rifle
-rocking chair
-rotisserie
-rubber eraser
-rugby ball
-rule
-running shoe
-safe
-safety pin
-saltshaker
-sandal
-sarong
-sax
-scabbard
-scale
-school bus
-schooner
-scoreboard
-screen
-screw
-screwdriver
-seat belt
-sewing machine
-shield
-shoe shop
-shoji
-shopping basket
-shopping cart
-shovel
-shower cap
-shower curtain
-ski
-ski mask
-sleeping bag
-slide rule
-sliding door
-slot
-snorkel
-snowmobile
-snowplow
-soap dispenser
-soccer ball
-sock
-solar dish
-sombrero
-soup bowl
-space bar
-space heater
-space shuttle
-spatula
-speedboat
-spider web
-spindle
-sports car
-spotlight
-stage
-steam locomotive
-steel arch bridge
-steel drum
-stethoscope
-stole
-stone wall
-stopwatch
-stove
-strainer
-streetcar
-stretcher
-studio couch
-stupa
-submarine
-suit
-sundial
-sunglass
-sunglasses
-sunscreen
-suspension bridge
-swab
-sweatshirt
-swimming trunks
-swing
-switch
-syringe
-table lamp
-tank
-tape player
-teapot
-teddy
-television
-tennis ball
-thatch
-theater curtain
-thimble
-thresher
-throne
-tile roof
-toaster
-tobacco shop
-toilet seat
-torch
-totem pole
-tow truck
-toyshop
-tractor
-trailer truck
-tray
-trench coat
-tricycle
-trimaran
-tripod
-triumphal arch
-trolleybus
-trombone
-tub
-turnstile
-typewriter keyboard
-umbrella
-unicycle
-upright
-vacuum
-vase
-vault
-velvet
-vending machine
-vestment
-viaduct
-violin
-volleyball
-waffle iron
-wall clock
-wallet
-wardrobe
-warplane
-washbasin
-washer
-water bottle
-water jug
-water tower
-whiskey jug
-whistle
-wig
-window screen
-window shade
-Windsor tie
-wine bottle
-wing
-wok
-wooden spoon
-wool
-worm fence
-wreck
-yawl
-yurt
-web site
-comic book
-crossword puzzle
-street sign
-traffic light
-book jacket
-menu
-plate
-guacamole
-consomme
-hot pot
-trifle
-ice cream
-ice lolly
-French loaf
-bagel
-pretzel
-cheeseburger
-hotdog
-mashed potato
-head cabbage
-broccoli
-cauliflower
-zucchini
-spaghetti squash
-acorn squash
-butternut squash
-cucumber
-artichoke
-bell pepper
-cardoon
-mushroom
-Granny Smith
-strawberry
-orange
-lemon
-fig
-pineapple
-banana
-jackfruit
-custard apple
-pomegranate
-hay
-carbonara
-chocolate sauce
-dough
-meat loaf
-pizza
-potpie
-burrito
-red wine
-espresso
-cup
-eggnog
-alp
-bubble
-cliff
-coral reef
-geyser
-lakeside
-promontory
-sandbar
-seashore
-valley
-volcano
-ballplayer
-groom
-scuba diver
-rapeseed
-daisy
-yellow lady's slipper
-corn
-acorn
-hip
-buckeye
-coral fungus
-agaric
-gyromitra
-stinkhorn
-earthstar
-hen-of-the-woods
-bolete
-ear
-toilet tissue
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 2c91be9d62..c57bb348c5 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -20,6 +20,9 @@ import android.content.res.AssetFileDescriptor;
import android.graphics.Bitmap;
import android.os.SystemClock;
import android.util.Log;
+
+import org.tensorflow.lite.Interpreter;
+
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
@@ -34,9 +37,10 @@ import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
-import org.tensorflow.lite.Interpreter;
-/** Classifies images with Tensorflow Lite. */
+/**
+ * Classifies images with Tensorflow Lite.
+ */
public abstract class ImageClassifier {
/** Tag for the {@link Log}. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
index 3108422952..be17b85e0c 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
@@ -16,22 +16,24 @@ limitations under the License.
package com.example.android.tflitecamerademo;
import android.app.Activity;
+
import java.io.IOException;
/**
- * This classifier works with the Inception-v3 slim model. It applies floating point inference
- * rather than using a quantized model.
+ * This classifier works with the Inception-v3 slim model.
+ * It applies floating point inference rather than using a quantized model.
*/
public class ImageClassifierFloatInception extends ImageClassifier {
- /** The inception net requires additional normalization of the used input. */
+ /**
+ * The inception net requires additional normalization of the used input.
+ */
private static final int IMAGE_MEAN = 128;
-
private static final float IMAGE_STD = 128.0f;
/**
- * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
- * of the super class, because we need a primitive array here.
+ * An array to hold inference results, to be feed into Tensorflow Lite as outputs.
+ * This isn't part of the super class, because we need a primitive array here.
*/
private float[][] labelProbArray = null;
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index 5f341f0f5b..156c895146 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -16,14 +16,17 @@ limitations under the License.
package com.example.android.tflitecamerademo;
import android.app.Activity;
+
import java.io.IOException;
-/** This classifier works with the quantized MobileNet model. */
+/**
+ * This classifier works with the quantized MobileNet model.
+ */
public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
/**
- * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
- * of the super class, because we need a primitive array here.
+ * An array to hold inference results, to be feed into Tensorflow Lite as outputs.
+ * This isn't part of the super class, because we need a primitive array here.
*/
private byte[][] labelProbArray = null;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 883c7f270d..780401e052 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include <string.h>
#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 2481add769..5488b71fcf 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -36,6 +36,7 @@ import traceback
import zipfile
import numpy as np
from six import StringIO
+from six.moves import xrange
# TODO(aselle): Disable GPU for now
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
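
The change shown above for generate_examples.py adds "from six.moves import xrange", which keeps the generator's loops working under both Python versions: six.moves.xrange resolves to the built-in range on Python 3 and to xrange on Python 2. A trivial sketch of the pattern:

from six.moves import xrange  # range on Python 3, xrange on Python 2

total = 0
for i in xrange(5):
  total += i
print(total)  # prints 10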
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
deleted file mode 100644
index a051ab0004..0000000000
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc
+++ /dev/null
@@ -1,1236 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#include <queue>
-#include <thread>
-#include <unordered_map>
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/platform/mutex.h"
-
-#define EIGEN_USE_THREADS
-
-#if GOOGLE_CUDA
-#include <cuda_runtime.h>
-#include "tensorflow/stream_executor/stream.h"
-#endif
-
-#include "tensorflow/stream_executor/lib/statusor.h"
-
-#define OMPI_SKIP_MPICXX
-#include "third_party/mpi/mpi.h"
-#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-/*
- * MPI Allreduce and Allgather Ops for TensorFlow.
- *
- * TensorFlow natively provides inter-device communication through send and
- * receive ops and inter-node communication through Distributed TensorFlow,
- * based on the same send and receive abstractions. These end up being
- * insufficient for synchronous data-parallel training on HPC clusters where
- * Infiniband or other high-speed interconnects are available. This module
- * implements MPI ops for allgather and allreduce, which do bandwidth-optimal
- * gathers and reductions and can take advantage of hardware-optimized
- * communication libraries through the MPI implementation.
- *
- * The primary logic of the allreduce and allgather are in RingAllgather() and
- * RingAllreduce(). The background thread which facilitates MPI operations is
- * run in BackgroundThreadLoop(). The provided MPI ops are:
- * – MPIInit:
- * Initialize MPI on a given device (CPU or GPU).
- * Should only be run on a single device in every process.
- * – MPISize:
- * Get the number of MPI processes in the global communicator.
- * – MPIRank:
- * Get the rank of the current MPI process in the global communicator.
- * – MPILocalRank:
- * Get the local rank of the current MPI process within its node.
- * – MPIAllreduce:
- * Perform an allreduce on a Tensor, returning the sum
- * across all MPI processes in the global communicator.
- * – MPIAllgather:
- * Perform an allgather on a Tensor, returning the concatenation of
- * the tensor on the first dimension across all MPI processes in the
- * global communicator.
- *
- */
-
-template <class T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-using GPUDevice = Eigen::GpuDevice;
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-// Make sure template specializations are generated in the ring.cu.cc and the
-// ring.cc file, not in this file.
-extern template Status RingAllreduce<GPUDevice, int>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-extern template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
- const Tensor*,
- Tensor*, Tensor*);
-extern template Status RingAllreduce<GPUDevice, float>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-extern template Status RingAllgather<GPUDevice, int>(OpKernelContext*,
- const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-extern template Status RingAllgather<GPUDevice, long long>(
- OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllgather<GPUDevice, float>(
- OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllreduce<CPUDevice, int>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-extern template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
- const Tensor*,
- Tensor*, Tensor*);
-extern template Status RingAllreduce<CPUDevice, float>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-extern template Status RingAllgather<CPUDevice, int>(OpKernelContext*,
- const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-extern template Status RingAllgather<CPUDevice, long long>(
- OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllgather<CPUDevice, float>(
- OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-
-namespace {
-
-// Return true if the templated type is GPUDevice, otherwise false.
-template <typename T>
-bool IsGPUDevice();
-template <>
-bool IsGPUDevice<GPUDevice>() {
- return true;
-};
-template <>
-bool IsGPUDevice<CPUDevice>() {
- return false;
-};
-
-// A callback to call after the MPI communication completes. Since the
-// allreduce and allgather ops are asynchronous, this callback is what resumes
-// computation after the reduction is completed.
-typedef std::function<void(StatusOr<Tensor>)> CommunicationDoneCallback;
-
-struct CollectiveOpRecord {
- // The rank performing this piece of the op
- int rank;
-
- // The name of the op/tensor to be reduced
- std::string name;
-
- // The op's kernel context
- OpKernelContext* context;
-
- // Data type of the op
- DataType dtype;
-
- // The input tensor
- const Tensor* in_t;
-
- // Allgather: Vector of per-rank first-dimension sizes
- std::vector<size_t> sizes_vec;
-
- // The temp tensor for intermediate results
- Tensor temp_t;
-
- // The output tensor
- Tensor* out_t;
-
- // Whether to run this op on the gpu
- bool on_gpu;
-
- // The callback to call after the op has completed
- CommunicationDoneCallback callback;
-};
-
-// Table storing Tensors to be reduced, keyed by unique name.
-// This table contains everything necessary to do the reduction
-typedef std::unordered_map<std::string, CollectiveOpRecord> TensorTable;
-
-// Table for storing Tensor metadata on rank zero. This is used for error
-// checking and size calculations, as well as determining when a reduction is
-// ready to be done (when all nodes are ready to do it).
-typedef std::unordered_map<std::string, std::vector<MPIRequest> > MessageTable;
-
-// The global state required for the MPI ops.
-//
-// MPI is a library that stores a lot of global per-program state and often
-// requires running on a single thread. As a result, we have to have a single
-// background thread responsible for all MPI operations, and communicate with
-// that background thread through global state.
-struct MPIGlobalState {
- // An atomic boolean which is set to true when MPI is initialized.
- // This ensures that MPI_Init is never called twice.
- std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT;
-
- // Condition variable to wait for initialization
- condition_variable cv;
-
- // Whether MPI_Init has been completed on the background thread.
- bool initialization_done = false;
-
- // Whether MPI_Init succeeded on the background thread.
- Status init_status;
-
- // A mutex that needs to be used whenever MPI operations touch
- // shared structures.
- mutex mu;
-
- // Tensors waiting to be allreduced or allgathered.
- TensorTable tensor_table;
-
- // Queue of MPI requests waiting to be sent to the coordinator node.
- std::queue<MPIRequest> message_queue;
-
- // Background thread running MPI communication.
- std::thread background_thread;
-
- // Whether the background thread should shutdown.
- bool shut_down = false;
-
- // Only exists on the coordinator node (rank zero). Maintains a count of
- // how many nodes are ready to allreduce every tensor (keyed by tensor
- // name).
- std::unique_ptr<MessageTable> message_table;
-
- // The MPI rank, local rank, and size.
- int rank = 0;
- int local_rank = 0;
- int size = 1;
-
- // The device that MPI was initialized on. (-1 for no GPU)
- int device = -1;
-
- // The CUDA stream used for data transfers and within-allreduce operations.
- // A naive implementation would use the TensorFlow StreamExecutor CUDA
- // stream. However, the allreduce and allgather require doing memory copies
- // and kernel executions (for accumulation of values on the GPU). However,
- // the subsequent operations must wait for those operations to complete,
- // otherwise MPI (which uses its own stream internally) will begin the data
- // transfers before the CUDA calls are complete. In order to wait for those
- // CUDA operations, if we were using the TensorFlow stream, we would have
- // to synchronize that stream; however, other TensorFlow threads may be
- // submitting more work to that stream, so synchronizing on it can cause
- // the allreduce to be delayed, waiting for compute totally unrelated to it
- // in other parts of the graph. Overlaying memory transfers and compute
- // during backpropagation is crucial for good performance, so we cannot use
- // the TensorFlow stream, and must use our own stream.
-#if GOOGLE_CUDA
- cudaStream_t stream;
- std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT;
-#endif
-
- ~MPIGlobalState() {
- // Make sure that the destructor of the background thread is safe to
- // call. If a thread is still joinable (not detached or complete) its
- // destructor cannot be called.
- if (background_thread.joinable()) {
- shut_down = true;
- background_thread.join();
- }
- }
-};
-
-// All the MPI state that must be stored globally per-process.
-static MPIGlobalState mpi_global;
-
-// For clarify in argument lists.
-#define RANK_ZERO 0
-
-// A tag used for all coordinator messaging.
-#define TAG_NOTIFY 1
-
-// Store the MPIRequest for a name, and return whether the total count of
-// MPIRequests for that tensor is now equal to the MPI size (and thus we are
-// ready to reduce the tensor).
-bool IncrementTensorCount(std::unique_ptr<MessageTable>& message_table,
- MPIRequest msg, int mpi_size) {
- auto name = msg.tensor_name();
- auto table_iter = message_table->find(name);
- if (table_iter == message_table->end()) {
- message_table->emplace(name, std::vector<MPIRequest>({msg}));
- table_iter = message_table->find(name);
- } else {
- table_iter->second.push_back(msg);
- }
-
- int count = table_iter->second.size();
- return count == mpi_size;
-}
-
-// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse
-// instructing all ranks to start the reduction to all ranks. The MPIResponse
-// also contains error messages in case the submitted MPIRequests were not
-// valid (for example, contained mismatched shapes or types).
-//
-// Constructing the MPIResponse, thus, requires a whole lot of error checking.
-MPIResponse ConstructMPIResponse(std::unique_ptr<MessageTable>& message_table,
- std::string name) {
- bool error = false;
- auto it = message_table->find(name);
- assert(it != message_table->end());
-
- std::vector<MPIRequest> requests = it->second;
- assert(requests.size() > 0);
-
- std::ostringstream error_message_stream;
-
- // Check that all data types being reduced or gathered are identical
- auto data_type = requests[0].tensor_type();
- for (unsigned int i = 1; i < requests.size(); i++) {
- auto request_type = requests[i].tensor_type();
- if (data_type != request_type) {
- error = true;
- error_message_stream << "Mismatched data types: One rank had type "
- << DataType_Name(data_type)
- << ", but another rank had type "
- << DataType_Name(request_type) << ".";
- break;
- }
- }
-
- // Check that all requested operations are the same
- auto message_type = requests[0].request_type();
- for (unsigned int i = 1; i < requests.size(); i++) {
- if (error) {
- break;
- }
-
- auto request_type = requests[i].request_type();
- if (message_type != request_type) {
- error = true;
- error_message_stream << "Mismatched MPI operations: One rank did an "
- << message_type << ", but another rank did an "
- << request_type << ".";
- break;
- }
- }
-
- // If we are doing an allreduce, check that all tensor shapes
- // are identical
- if (message_type == MPIRequest::ALLREDUCE) {
- TensorShape tensor_shape = requests[0].tensor_shape();
- for (unsigned int i = 1; i < requests.size(); i++) {
- if (error) {
- break;
- }
-
- TensorShape request_shape = requests[i].tensor_shape();
- if (tensor_shape != request_shape) {
- error = true;
- error_message_stream << "Mismatched allreduce tensor shapes: "
- << "One rank reduced a tensor of shape "
- << tensor_shape.DebugString()
- << ", but another rank sent a tensor of shape "
- << request_shape.DebugString() << ".";
- break;
- }
- }
- }
-
- // If we are doing an allgather, make sure all but the first dimension are
- // the same. The first dimension may be different and the output tensor is
- // the sum of the first dimension. Collect the sizes by rank.
- if (message_type == MPIRequest::ALLGATHER) {
- TensorShape tensor_shape = requests[0].tensor_shape();
-
- if (tensor_shape.dims() == 0) {
- error = true;
- error_message_stream << "Rank zero tried to gather a rank-zero tensor.";
- }
-
- for (unsigned int i = 1; i < requests.size(); i++) {
- if (error) {
- break;
- }
-
- TensorShape request_shape = requests[i].tensor_shape();
- if (tensor_shape.dims() != request_shape.dims()) {
- error = true;
- error_message_stream << "Mismatched allgather tensor shapes: "
- << "One rank gathered a tensor of rank "
- << tensor_shape.dims()
- << ", but another rank sent a tensor of rank "
- << request_shape.dims() << ".";
- break;
- }
-
- for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) {
- if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) {
- error = true;
- error_message_stream
- << "Mismatched allgather tensor shapes: "
- << "One rank gathered a tensor with dimension " << dim
- << " equal to " << tensor_shape.dim_size(dim)
- << ", but another rank sent a tensor with dimension " << dim
- << " equal to " << request_shape.dim_size(dim) << ".";
- break;
- }
- }
- }
- }
-
- MPIResponse response;
- response.set_tensor_name(name);
- if (error) {
- std::string error_message = error_message_stream.str();
- response.set_response_type(MPIResponse::ERROR);
- response.set_error_message(error_message);
- } else {
- auto response_type = MPIResponse::ERROR;
- if (message_type == MPIRequest::ALLREDUCE) {
- response_type = MPIResponse::ALLREDUCE;
- } else {
- response_type = MPIResponse::ALLGATHER;
- }
- response.set_response_type(response_type);
- }
-
- // Clear all queued up requests for this name. They are now taken care of
- // by the constructed MPI response.
- message_table->erase(it);
-
- return response;
-}
-
-// Process an MPIResponse by doing a reduction, a gather, or raising an error.
-void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) {
- OpKernelContext* context;
- const Tensor* input_tensor;
- std::vector<size_t> sizes_vec;
- Tensor temp_tensor;
- Tensor* output_tensor;
- CommunicationDoneCallback callback;
- bool on_gpu;
- {
- // Lock on the tensor table.
- mutex_lock guard(mpi_global.mu);
-
- // We should never fail at finding this key in the tensor table.
- auto name = response.tensor_name();
- auto iter = tensor_table.find(name);
- assert(iter != tensor_table.end());
-
- assert(response.response_type() == MPIResponse::ALLREDUCE ||
- response.response_type() == MPIResponse::ALLGATHER ||
- response.response_type() == MPIResponse::ERROR);
-
- CollectiveOpRecord record = iter->second;
- context = record.context;
- input_tensor = record.in_t;
- sizes_vec = record.sizes_vec;
- temp_tensor = record.temp_t;
- output_tensor = record.out_t;
- on_gpu = record.on_gpu;
- callback = record.callback;
-
- // Clear the tensor table of this tensor and its callbacks; the rest of
- // this function takes care of it.
- tensor_table.erase(iter);
- }
-
- // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't
- // link to non-existent symbols.
-#if GOOGLE_CUDA
-#define GPU_DEVICE_IF_CUDA GPUDevice
-#else
-#define GPU_DEVICE_IF_CUDA CPUDevice
-#endif
-
- Status status;
- auto dtype = input_tensor->dtype();
- if (response.response_type() == MPIResponse::ALLGATHER) {
- if (dtype == DT_FLOAT) {
- status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, float>(
- context, input_tensor, sizes_vec, output_tensor)
- : RingAllgather<CPUDevice, float>(
- context, input_tensor, sizes_vec, output_tensor);
- } else if (dtype == DT_INT32) {
- status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, int>(
- context, input_tensor, sizes_vec, output_tensor)
- : RingAllgather<CPUDevice, int>(context, input_tensor,
- sizes_vec, output_tensor);
- } else if (dtype == DT_INT64) {
- status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, long long>(
- context, input_tensor, sizes_vec, output_tensor)
- : RingAllgather<CPUDevice, long long>(
- context, input_tensor, sizes_vec, output_tensor);
- } else {
- status = errors::Unknown("Invalid tensor type for MPI allgather.");
- }
- } else if (response.response_type() == MPIResponse::ALLREDUCE) {
- if (dtype == DT_FLOAT) {
- status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, float>(
- context, input_tensor, &temp_tensor, output_tensor)
- : RingAllreduce<CPUDevice, float>(
- context, input_tensor, &temp_tensor, output_tensor);
- } else if (dtype == DT_INT32) {
- status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, int>(
- context, input_tensor, &temp_tensor, output_tensor)
- : RingAllreduce<CPUDevice, int>(
- context, input_tensor, &temp_tensor, output_tensor);
- } else if (dtype == DT_INT64) {
- status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, long long>(
- context, input_tensor, &temp_tensor, output_tensor)
- : RingAllreduce<CPUDevice, long long>(
- context, input_tensor, &temp_tensor, output_tensor);
- } else {
- status = errors::Unknown("Invalid tensor type for MPI allreduce.");
- }
- } else if (response.response_type() == MPIResponse::ERROR) {
- status = errors::FailedPrecondition(response.error_message());
- }
-
- if (status.ok()) {
- callback(StatusOr<Tensor>(*output_tensor));
- } else {
- callback(StatusOr<Tensor>(status));
- }
-}
-
-// The MPI background thread loop coordinates all the MPI processes and the
-// tensor reductions. The design of the communicator mechanism is limited by a
-// few considerations:
-//
-// 1. Some MPI implementations require all MPI calls to happen from a
-// single thread. Since TensorFlow may use several threads for graph
-// processing, this means we must have our own dedicated thread for
-// dealing with MPI.
-// 2. We want to gracefully handle errors, when MPI processes do not
-// properly agree upon what should happen (such as mismatched types or
-// shapes). To do so requires the MPI processes to know about the shapes
-// and types of the relevant tensors on the other processes.
-// 3. The MPI reductions and gathers should be able to happen in parallel
-// with other ongoing operations. Since MPI uses an internal
-// (inaccessible) GPU stream separate from the TF GPUDevice streams, we
-// cannot explicitly synchronize memcpys or kernels with it. As a result,
-// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper
-// ordering of memcpys and kernels with respect to TF streams.
-// 4. NOTE: We cannot guarantee that all the MPI processes reduce their
-// tensors in the same order. Thus, there must be a way to ensure the
-// reduction memcpys and kernels occur for correct tensors across all
-// ranks at the same time. We choose to use a coordinator (rank ID 0) to
-// gather and trigger the reduction operations that are ready to execute.
-//
-// The coordinator currently follows a master-worker paradigm. Rank zero acts
-// as the master (the "coordinator"), whereas all other ranks are simply
-// workers. Each rank runs its own background thread which progresses in ticks.
-// In each tick, the following actions happen:
-//
-// a) The workers send any available MPIRequests to the coordinator. These
-// MPIRequests indicate what the worker would like to do (i.e. which
-// tensor they would like to gather or reduce, as well as their shape and
-// type). They repeat this for every tensor that they would like to
-// operate on after that tensor's collective op has executed ComputeAsync.
-//
-// b) The workers send an empty "DONE" message to the coordinator to
-// indicate that there are no more tensors they wish to operate on.
-//
-// c) The coordinator receives the MPIRequests from the workers, as well
-// as from its own TensorFlow ops, and stores them in a request table. The
-// coordinator continues to receive MPIRequest messages until it has
-// received MPI_SIZE number of empty "DONE" messages.
-//
-// d) The coordinator finds all tensors that are ready to be reduced,
-// gathered, or all operations that result in an error. For each of those,
-// it sends an MPIResponse to all the workers. When no more MPIResponses
-// are available, it sends a "DONE" response to the workers. If the
-// process is being shutdown, it instead sends a "SHUTDOWN" response.
-//
-// e) The workers listen for MPIResponse messages, processing each one by
-// doing the required reduce or gather, until they receive a "DONE"
-// response from the coordinator. At that point, the tick ends.
-// If instead of "DONE" they receive "SHUTDOWN", they exit their
-// background loop.
-// TODO: Use the global mpi_global state variable instead of a local one
-void BackgroundThreadLoop() {
-#if GOOGLE_CUDA
- // Set the device, so that this thread uses the same GPU context as the
- // calling thread.
- // TODO: Ensure that this is operating correctly. The background thread
- // needs to be able to control all GPUs that the rank has access to, and
- // might be more than 1 GPU. Tensors could be resident in any of the
- // GPUs, so the background thread's accumulate and copy kernels might need
- // to correctly set the device and it might be necessary for the background
- // thread to manage multiple streams.
- cudaSetDevice(mpi_global.device);
- cudaStreamCreate(&mpi_global.stream);
-#endif
-
- // Initialize MPI. This must happen on the background thread, since not all
- // MPI implementations support being called from multiple threads.
- auto init_result = MPI_Init(NULL, NULL);
- if (init_result != MPI_SUCCESS) {
- mpi_global.init_status =
- errors::Unknown("Could not initialize MPI; MPI_Init() failed.");
- mpi_global.initialization_done = true;
- mpi_global.cv.notify_all();
- return;
- } else {
- mpi_global.init_status = Status::OK();
- }
-
- // Get MPI rank to determine if we are rank zero.
- int rank;
- MPI_Comm_rank(MPI_COMM_WORLD, &rank);
- bool is_coordinator = rank == 0;
-
- // Get MPI size to determine how many tensors to wait for before reducing.
- int size;
- MPI_Comm_size(MPI_COMM_WORLD, &size);
-
- // Determine local rank by querying the local communicator.
- MPI_Comm local_comm;
- MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
- &local_comm);
- int local_rank;
- MPI_Comm_rank(local_comm, &local_rank);
-
- mpi_global.rank = rank;
- mpi_global.local_rank = local_rank;
- mpi_global.size = size;
- mpi_global.initialization_done = true;
-
- // Notify calling thread that initialization is complete
- mpi_global.cv.notify_all();
-
- // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD!
- // Initialize the tensor count table. No tensors are available yet.
- if (is_coordinator) {
- mpi_global.message_table =
- std::unique_ptr<MessageTable>(new MessageTable());
- }
-
- // The coordinator sends a SHUTDOWN message to trigger shutdown.
- bool should_shut_down = false;
- do {
- // TODO: Eliminate the need for thread sleep by making all activity
- // depend on other activity (e.g. condition or MPI waits).
- std::this_thread::sleep_for(std::chrono::milliseconds(1));
-
- // Copy the data structures from global state under this lock.
- // However, don't keep the lock for the rest of the loop, so that
- // enqueued stream callbacks can continue.
- std::queue<MPIRequest> message_queue;
- {
- mutex_lock guard(mpi_global.mu);
- while (!mpi_global.message_queue.empty()) {
- MPIRequest message = mpi_global.message_queue.front();
- mpi_global.message_queue.pop();
- message_queue.push(message);
- }
- }
-
- // Collect all tensors that are ready to be reduced. Record them in the
- // tensor count table (rank zero) or send them to rank zero to be
- // recorded (everyone else).
- std::vector<std::string> ready_to_reduce;
- while (!message_queue.empty()) {
- // Pop the first available message
- MPIRequest message = message_queue.front();
- message_queue.pop();
-
- if (is_coordinator) {
- bool reduce =
- IncrementTensorCount(mpi_global.message_table, message, size);
- if (reduce) {
- ready_to_reduce.push_back(message.tensor_name());
- }
- } else {
- std::string encoded_message;
- message.SerializeToString(&encoded_message);
- MPI_Send(encoded_message.c_str(), encoded_message.length() + 1,
- MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
- }
- }
-
- // Rank zero has put all its own tensors in the tensor count table.
- // Now, it should count all the tensors that are coming from other
- // ranks at this tick. It should keep getting tensors until it gets a
- // DONE message from all the other ranks.
- if (is_coordinator) {
- // Count of DONE messages. Keep receiving messages until the number
- // of messages is equal to the number of processes. Initialize to
- // one since the coordinator is effectively done.
- int completed_ranks = 1;
- while (completed_ranks != size) {
- MPI_Status status;
- MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status);
-
- // Find number of characters in message (including zero byte).
- int source_rank = status.MPI_SOURCE;
- int msg_length;
- MPI_Get_count(&status, MPI_BYTE, &msg_length);
-
- // If the length is zero, this is a DONE message.
- if (msg_length == 0) {
- completed_ranks++;
- MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD,
- &status);
- continue;
- }
-
- // Get tensor name from MPI into an std::string.
- char* buffer = new char[msg_length];
- MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY,
- MPI_COMM_WORLD, &status);
- std::string received_data(buffer);
- delete[] buffer;
-
- MPIRequest received_message;
- received_message.ParseFromString(received_data);
- auto received_name = received_message.tensor_name();
-
- bool reduce = IncrementTensorCount(mpi_global.message_table,
- received_message, size);
- if (reduce) {
- ready_to_reduce.push_back(received_name);
- }
- }
-
- // At this point, rank zero should have a fully updated tensor
- // count table and should know all the tensors that need to be
- // reduced or gathered, and everyone else should have sent all
- // their information to rank zero. We can now do reductions and
- // gathers; rank zero will choose which ones and in what order,
- // and will notify the other ranks before doing each reduction.
- for (int i = 0; i < ready_to_reduce.size(); i++) {
- // Notify all nodes which tensor we'd like to reduce now
- auto name = ready_to_reduce[i];
- MPIResponse response =
- ConstructMPIResponse(mpi_global.message_table, name);
-
- std::string encoded_response;
- response.SerializeToString(&encoded_response);
- for (int r = 1; r < size; r++) {
- MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
- MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
- }
-
- // Perform the reduction. All nodes should end up performing
- // the same reduction.
- PerformCollectiveOp(mpi_global.tensor_table, response);
- }
-
- // Notify all nodes that we are done with the reductions for this
- // tick.
- MPIResponse done_response;
- should_shut_down = mpi_global.shut_down;
- done_response.set_response_type(
- mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE);
- std::string encoded_response;
- done_response.SerializeToString(&encoded_response);
- for (int r = 1; r < size; r++) {
- MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
- MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
- }
- } else {
- // Notify the coordinator that this node is done sending messages.
- // A DONE message is encoded as a zero-length message.
- MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
-
- // Receive names for tensors to reduce from rank zero. Once we
- // receive an empty DONE message, stop waiting for more names.
- while (true) {
- MPI_Status status;
- MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status);
-
- // Find number of characters in message (including zero byte).
- int msg_length;
- MPI_Get_count(&status, MPI_BYTE, &msg_length);
-
- // Get tensor name from MPI into an std::string.
- char* buffer = new char[msg_length];
- MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD,
- &status);
- std::string received_message(buffer);
- delete[] buffer;
-
- MPIResponse response;
- response.ParseFromString(received_message);
- if (response.response_type() == MPIResponse::DONE) {
- // No more messages this tick
- break;
- } else if (response.response_type() == MPIResponse::SHUTDOWN) {
- // No more messages this tick, and the background thread
- // should shut down
- should_shut_down = true;
- break;
- } else {
- // Process the current message
- PerformCollectiveOp(mpi_global.tensor_table, response);
- }
- }
- }
- } while (!should_shut_down);
-
- MPI_Finalize();
-}
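A minimal Python sketch of the coordinator bookkeeping behind the tick protocol described in the comment block above BackgroundThreadLoop — hypothetical names only, with none of the real MPIRequest/MPIResponse or MPI_Send/MPI_Recv plumbing: every rank announces a tensor, and a response becomes ready once all ranks have announced it.

from collections import defaultdict

def coordinator_tick(incoming, mpi_size, table=None):
    """Record (rank, tensor_name) announcements; return names every rank announced."""
    table = table if table is not None else defaultdict(set)
    ready = []
    for rank, name in incoming:
        table[name].add(rank)
        if len(table[name]) == mpi_size:
            ready.append(name)
            del table[name]  # analogous to message_table->erase once the response is built
    return ready, table

# Three ranks must all announce "grad/w" before it is ready to reduce.
ready, table = coordinator_tick([(0, "grad/w"), (1, "grad/w")], mpi_size=3)
assert ready == []
ready, _ = coordinator_tick([(2, "grad/w")], mpi_size=3, table=table)
assert ready == ["grad/w"]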
-
-// Initialize MPI and start the MPI background thread. Ensure that this is
-// only done once no matter how many times this function is called.
-Status InitializeMPIOnce(bool gpu) {
- // Ensure MPI is only initialized once.
- if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status;
-
- mpi_global.device = -1;
-#if GOOGLE_CUDA
- if (gpu) {
- cudaGetDevice(&mpi_global.device);
- }
-#endif
-
- // Start the MPI background thread, which assumes MPI is initialized
- // TODO: Change this to a Tensorflow thread
- mpi_global.background_thread = std::thread(BackgroundThreadLoop);
-
- // Wait to ensure that the background thread has finished initializing MPI
- mutex_lock guard(mpi_global.mu);
- mpi_global.cv.wait(guard);
- if (!mpi_global.initialization_done) {
- mpi_global.init_status =
- errors::Unknown("Failed to wait for MPI initialization.");
- }
-
- return mpi_global.init_status;
-}
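A small Python sketch of the init-once handshake implemented above, under assumed names: the first caller starts the background thread, and every caller then blocks until that thread reports whether initialization succeeded.

import threading

_init_lock = threading.Lock()
_init_started = False
_init_done = threading.Event()
_init_status = {"ok": None}

def initialize_once(background_loop):
    """First caller spawns the background thread; all callers wait for its result."""
    global _init_started
    with _init_lock:            # plays the role of initialized_flag.test_and_set()
        if not _init_started:
            _init_started = True
            threading.Thread(target=background_loop, daemon=True).start()
    _init_done.wait()           # plays the role of the condition-variable wait
    return _init_status["ok"]

def fake_background_loop():
    # Stand-in for BackgroundThreadLoop: pretend MPI_Init succeeded.
    _init_status["ok"] = True
    _init_done.set()

assert initialize_once(fake_background_loop) is True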
-
-// Check that MPI is initialized.
-Status IsMPIInitialized() {
- if (!mpi_global.initialization_done) {
- return errors::FailedPrecondition(
- "MPI has not been initialized; use tf.contrib.mpi.Session.");
- }
- return Status::OK();
-}
-
-// This function (called from the callback set up in MPIAll*Op::ComputeAsync)
-// only adds the op's record into the local op queue (to track the op's
-// progress), and sends a message to the coordinator indicating that this rank
-// is ready to begin. The MPI background thread will handle the MPI message.
-void EnqueueTensorCollective(CollectiveOpRecord record,
- MPIRequest::RequestType rtype) {
- const Tensor* input_tensor = record.in_t;
- MPIRequest message;
- message.set_request_rank(record.rank);
- message.set_tensor_name(record.name);
- message.set_tensor_type(record.dtype);
- message.set_request_type(rtype);
- input_tensor->shape().AsProto(message.mutable_tensor_shape());
-
- mutex_lock guard(mpi_global.mu);
- mpi_global.tensor_table.emplace(record.name, record);
- mpi_global.message_queue.push(message);
-}
-
-} // namespace
-
-#if GOOGLE_CUDA
-cudaStream_t CudaStreamForMPI() { return mpi_global.stream; }
-#endif
-
-// Op to initialize MPI in the current process. The settings used in the
-// configuration are the same that must be used for all future MPI ops.
-template <typename Device>
-class MPIInitOp : public OpKernel {
- public:
- explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {}
-
- void Compute(OpKernelContext* context) override {
- bool on_gpu = IsGPUDevice<Device>();
- OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu));
- }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU),
- MPIInitOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU),
- MPIInitOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIInit").Doc(R"doc(
-Initialize MPI for the current process.
-
-If this is run on a GPU, then that GPU must be used for all future MPI
-operations. If it is run on CPU, then all future MPI operations must also
-run on CPU.
-)doc");
-
-// Op to get the current MPI Size.
-template <typename Device>
-class MPISizeOp : public OpKernel {
- public:
- explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {}
-
- void Compute(OpKernelContext* context) override {
- OP_REQUIRES_OK(context, IsMPIInitialized());
-
- // Write integer to output tensor
- Tensor* output;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, TensorShape({}), &output));
-
- auto flat = output->flat<int>();
- flat(0) = mpi_global.size;
- }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU),
- MPISizeOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"),
- MPISizeOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPISize")
- .Output("size: int32")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- c->set_output(0, c->Scalar());
- return Status::OK();
- })
- .Doc(R"doc(
-Returns the number of running MPI processes.
-
-More precisely, returns the number of MPI processes in the group associated
-with the MPI_COMM_WORLD communicator.
-
-size: Size of the MPI group.
-)doc");
-
-// Op to get the current MPI Rank.
-template <typename Device>
-class MPIRankOp : public OpKernel {
- public:
- explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {}
-
- void Compute(OpKernelContext* context) override {
- OP_REQUIRES_OK(context, IsMPIInitialized());
-
- // Write integer to output tensor
- Tensor* output;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, TensorShape({}), &output));
-
- auto flat = output->flat<int>();
- flat(0) = mpi_global.rank;
- }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU),
- MPIRankOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"),
- MPIRankOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIRank")
- .Output("rank: int32")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- c->set_output(0, c->Scalar());
- return Status::OK();
- })
- .Doc(R"doc(
-Returns the index of the current process in the MPI group.
-
-More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
-communicator.
-
-rank: Rank of the calling process.
-)doc");
-
-// Op to get the current local MPI Rank.
-template <typename Device>
-class MPILocalRankOp : public OpKernel {
- public:
- explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {}
-
- void Compute(OpKernelContext* context) override {
- OP_REQUIRES_OK(context, IsMPIInitialized());
-
- // Write integer to output tensor
- Tensor* output;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, TensorShape({}), &output));
-
- auto flat = output->flat<int>();
- flat(0) = mpi_global.local_rank;
- }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU),
- MPILocalRankOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
- Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"),
- MPILocalRankOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPILocalRank")
- .Output("rank: int32")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- c->set_output(0, c->Scalar());
- return Status::OK();
- })
- .Doc(R"doc(
-Returns the index of the current process in the node it is on.
-
-More precisely, returns the rank of the calling process in a communicator that
-only spans the MPI processes running on that node.
-
-rank: Rank of the calling process on the node it is on.
-)doc");
-
-template <typename Device>
-class MPIAllreduceOp : public AsyncOpKernel {
- public:
- explicit MPIAllreduceOp(OpKernelConstruction* context)
- : AsyncOpKernel(context) {}
-
- // Although this op is handled asynchronously, the ComputeAsync call is
- // very inexpensive. It only sets up a CollectiveOpRecord and places it
- // in the table for the background thread to handle. Thus, we do not need
- // a TF pool thread to perform the op.
- bool IsExpensive() override { return false; }
-
- void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
- OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
- const Tensor* input_tensor = &context->input(0);
- Tensor* output_tensor;
- OP_REQUIRES_OK_ASYNC(
- context,
- context->allocate_output(0, input_tensor->shape(), &output_tensor),
- done);
-
- // Record allocated on stack so op can fail without memory leak
- CollectiveOpRecord record;
- record.name = name();
- record.context = context;
- record.in_t = input_tensor;
- record.out_t = output_tensor;
- record.on_gpu = IsGPUDevice<Device>();
- record.dtype = input_tensor->dtype();
-
- const size_t temp_size =
- (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size;
- TensorShape temp_shape;
- temp_shape.AddDim(temp_size);
- OP_REQUIRES_OK_ASYNC(context,
- context->allocate_temp(input_tensor->dtype(),
- temp_shape, &record.temp_t),
- done);
-
- auto allreduce_done_callback = [done, context](StatusOr<Tensor> status) {
- context->SetStatus(status.status());
- done();
- };
- record.callback = allreduce_done_callback;
-
- auto allreduce_launch_callback = [record] {
- EnqueueTensorCollective(record, MPIRequest::ALLREDUCE);
- };
-
- // If we are on a CPU, our device context will be null and we can't
- // get a stream to enqueue this on. On a CPU this op is called when the
- // data is already available, so we can just immediately do the
- // allreduce; we don't have to wait for the data to get populated.
-#if GOOGLE_CUDA
- auto device_context = context->op_device_context();
- if (device_context == nullptr) {
- allreduce_launch_callback();
- } else {
- auto stream = device_context->stream();
- stream->ThenDoHostCallback(allreduce_launch_callback);
- }
-#else
- allreduce_launch_callback();
-#endif
- }
-};
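The temp tensor allocated in ComputeAsync above only needs to hold one ring segment; a short sketch of that ceiling-division sizing:

def temp_segment_size(num_elements, num_ranks):
    """Largest ring segment: ceil(num_elements / num_ranks)."""
    return (num_elements + num_ranks - 1) // num_ranks

assert temp_segment_size(10, 4) == 3  # segments of sizes 3, 3, 2, 2
assert temp_segment_size(8, 4) == 2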
-
-REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU),
- MPIAllreduceOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU),
- MPIAllreduceOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIAllreduce")
- .Attr("T: {int32, int64, float32}")
- .Input("tensor: T")
- .Output("sum: T")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- c->set_output(0, c->input(0));
- return Status::OK();
- })
- .Doc(R"doc(
-Perform an MPI Allreduce on a tensor. All other processes that do a reduction
-on a tensor with the same name must have the same dimension for that tensor.
-Tensors are reduced with other tensors that have the same node name for the
-allreduce.
-
-Arguments
- tensor: A tensor to reduce.
-
-Output
- sum: A tensor with the same shape as `tensor`, summed across all
- MPI processes.
-)doc");
-
-template <typename Device>
-class MPIAllgatherOp : public AsyncOpKernel {
- public:
- explicit MPIAllgatherOp(OpKernelConstruction* context)
- : AsyncOpKernel(context) {}
-
- // Although this op is handled asynchronously, the ComputeAsync call is
- // very inexpensive. It only sets up a CollectiveOpRecord and places it
- // in the table for the background thread to handle. Thus, we do not need
- // a TF pool thread to perform the op.
- bool IsExpensive() override { return false; }
-
- void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
- OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
- const Tensor* input_tensor = &context->input(0);
- const Tensor* sizing_tensor = &context->input(1);
-
- // Record allocated on stack so op can fail without memory leak
- CollectiveOpRecord record;
- record.name = name();
- record.context = context;
- record.in_t = input_tensor;
- record.on_gpu = IsGPUDevice<Device>();
-
- // Construct the output size from the sizing tensor
- size_t output_first_dim = 0;
- if (sizing_tensor->shape().dims() == 0) {
- // 0-dim sizing_tensor implies that the op is just gathering
- // a single element from each rank
- output_first_dim = mpi_global.size;
- for (int i = 0; i < mpi_global.size; i++) {
- record.sizes_vec.push_back(1);
- }
- } else {
- // Collect the total output tensor sizing from the sizing tensor
- // NOTE: The sizing tensor is forced to be placed on the CPU by
- // declaring the input as HostMemory, so it is valid to read it here.
- const int64* sizing_array =
- (const int64*)sizing_tensor->tensor_data().data();
- for (int i = 0; i < mpi_global.size; i++) {
- record.sizes_vec.push_back(sizing_array[i]);
- output_first_dim += sizing_array[i];
- }
- }
-
- TensorShape output_shape;
- output_shape.AddDim(output_first_dim);
- for (int i = 1; i < input_tensor->shape().dims(); i++) {
- output_shape.AddDim(input_tensor->shape().dim_size(i));
- }
-
- Tensor* output_tensor;
- OP_REQUIRES_OK_ASYNC(
- context, context->allocate_output(0, output_shape, &output_tensor),
- done);
-
- record.out_t = output_tensor;
- record.dtype = input_tensor->dtype();
-
- auto allgather_done_callback = [done, context](StatusOr<Tensor> status) {
- context->SetStatus(status.status());
- done();
- };
- record.callback = allgather_done_callback;
-
- auto allgather_launch_callback = [record] {
- EnqueueTensorCollective(record, MPIRequest::ALLGATHER);
- };
-
- // If we are on a CPU, our device context will be null and we can't
- // get a stream to enqueue this on. On a CPU this op is called when the
- // data is already available, so we can just immediately do the
- // allgather; we don't have to wait for the data to get populated.
-#if GOOGLE_CUDA
- auto device_context = context->op_device_context();
- if (device_context == nullptr) {
- allgather_launch_callback();
- } else {
- auto stream = device_context->stream();
- stream->ThenDoHostCallback(allgather_launch_callback);
- }
-#else
- allgather_launch_callback();
-#endif
- }
-};
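A sketch of the output-shape computation in ComputeAsync above: the gathered first dimension is the sum of every rank's first-dimension size, the remaining dimensions come from the local input, and a scalar sizes tensor means one row per rank.

def allgather_output_shape(input_shape, sizes_vec):
    return [sum(sizes_vec)] + list(input_shape[1:])

assert allgather_output_shape([2, 5], [2, 3, 1]) == [6, 5]            # ragged first dims
assert allgather_output_shape([1, 4, 4], [1, 1, 1, 1]) == [4, 4, 4]   # scalar-sizes case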
-
-REGISTER_OP("MPIAllgather")
- .Attr("T: {int32, int64, float32}")
- .Attr("S: {int64}")
- .Input("tensor: T")
- .Input("sizes: S")
- .Output("gathered: T")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- shape_inference::ShapeHandle output;
- TF_RETURN_IF_ERROR(
- c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
- c->set_output(0, output);
- return Status::OK();
- })
- .Doc(R"doc(
-Perform an MPI Allgather on a tensor. All other processes that do a gather on a
-tensor with the same name must have the same rank for that tensor, and the
-same size in every dimension except the first.
-
-Arguments
- tensor: A tensor to gather.
- sizes: A tensor containing the first-dimension sizes of tensors to be
- gathered from other ranks.
-
-Output
- gathered: A tensor with the same shape as `tensor` except for the first
- dimension, which is the sum of dimensions in `sizes`.
-)doc");
-
-REGISTER_KERNEL_BUILDER(
- Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"),
- MPIAllgatherOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
- Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"),
- MPIAllgatherOp<GPUDevice>);
-#endif
-
-} // namespace mpi
-} // namespace contrib
-} // namespace tensorflow
-
-#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/mpi_ops.py
deleted file mode 100644
index bd7096d9ce..0000000000
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Inter-process communication using MPI."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import load_library
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-from tensorflow.python.platform import tf_logging as logging
-
-
-def _load_library(name, op_list=None):
- """Loads a .so file containing the specified operators.
-
- Args:
- name: The name of the .so file to load.
- op_list: A list of names of operators that the library should have. If None
- then the .so file's contents will not be verified.
-
- Raises:
- NameError if one of the required ops is missing.
- """
- try:
- filename = resource_loader.get_path_to_datafile(name)
- library = load_library.load_op_library(filename)
- for expected_op in (op_list or []):
- for lib_op in library.OP_LIST.op:
- if lib_op.name == expected_op:
- break
- else:
- raise NameError('Could not find operator %s in dynamic library %s' %
- (expected_op, name))
- return library
- except errors.NotFoundError:
- logging.warning('%s file could not be loaded.', name)
-
-
-MPI_LIB = _load_library(
- 'mpi_collectives.so',
- ['MPISize', 'MPIRank', 'MPILocalRank', 'MPIAllgather', 'MPIAllreduce'])
-
-
-def size(name=None):
- """An op which returns the number of MPI processes.
-
- This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the
- size of the global communicator.
-
- Returns:
- An integer scalar containing the number of MPI processes.
- """
- return MPI_LIB.mpi_size(name=name)
-
-
-ops.NotDifferentiable('MPISize')
-
-
-def rank(name=None):
- """An op which returns the MPI rank of the calling process.
-
- This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the
- rank of the current process in the global communicator.
-
- Returns:
- An integer scalar with the MPI rank of the calling process.
- """
- return MPI_LIB.mpi_rank(name=name)
-
-
-ops.NotDifferentiable('MPIRank')
-
-
-def init(name=None):
- """An op which initializes MPI on the device on which it is run.
-
- All future MPI ops must be run on the same device that the `init` op was run
- on.
- """
- return MPI_LIB.mpi_init(name=name)
-
-
-ops.NotDifferentiable('MPIInit')
-
-
-def local_rank(name=None):
- """An op which returns the local MPI rank of the calling process, within the
- node that it is running on. For example, if there are seven processes running
- on a node, their local ranks will be zero through six, inclusive.
-
- This is equivalent to running `MPI_Comm_rank(...)` on a new communicator
- which only includes processes on the same node.
-
- Returns:
- An integer scalar with the local MPI rank of the calling process.
- """
- return MPI_LIB.mpi_local_rank(name=name)
-
-
-ops.NotDifferentiable('MPILocalRank')
-
-
-def _allreduce(tensor, name=None):
- """An op which sums an input tensor over all the MPI processes.
-
- The reduction operation is keyed by the name of the op. The tensor type and
- shape must be the same on all MPI processes for a given name. The reduction
- will not start until all processes are ready to send and receive the tensor.
-
- Returns:
- A tensor of the same shape and type as `tensor`, summed across all
- processes.
- """
- return MPI_LIB.mpi_allreduce(tensor, name=name)
-
-
-ops.NotDifferentiable('MPIAllreduce')
-
-
-def allgather(tensor, name=None):
- """An op which concatenates the input tensor with the same input tensor on
- all other MPI processes.
-
- The concatenation is done on the first dimension, so the input tensors on the
- different processes must have the same rank and shape, except for the first
- dimension, which is allowed to be different.
-
- Returns:
- A tensor of the same type as `tensor`, concatenated on dimension zero
- across all processes. The shape is identical to the input shape, except for
- the first dimension, which may be greater and is the sum of all first
- dimensions of the tensors in different MPI processes.
- """
- # Specify that first allgather is to collect the tensor gather sizes,
- # indicated by passing in a scalar (0-D tensor) of value 0
- sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const')
- my_size = tf.slice(
- tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice')
- if name is None:
- name = 'allgather'
- sizing_name = '{}_sizing'.format(name)
- sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
- return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
-
-
-ops.NotDifferentiable('MPIAllgather')
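A hedged usage sketch for the module deleted above, as it existed before this change; the import path and session setup are assumptions, and the script would be launched under mpirun (e.g. `mpirun -np 4 python train.py`).

import tensorflow as tf
from tensorflow.contrib.mpi_collectives import mpi_ops  # path assumed from this file's location

with tf.Graph().as_default():
    init_op = mpi_ops.init()
    size_op = mpi_ops.size()
    grad = tf.constant([1.0, 2.0, 3.0])
    summed = mpi_ops._allreduce(grad)   # module-internal helper: element-wise sum across ranks
    gathered = mpi_ops.allgather(grad)  # concatenation along dimension 0
    with tf.Session() as sess:
        sess.run(init_op)
        print(sess.run([size_op, summed, gathered]))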
diff --git a/tensorflow/contrib/mpi_collectives/ring.cc b/tensorflow/contrib/mpi_collectives/ring.cc
deleted file mode 100644
index d93233eb21..0000000000
--- a/tensorflow/contrib/mpi_collectives/ring.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-extern template MPI_Datatype MPIType<float>();
-extern template MPI_Datatype MPIType<int>();
-extern template MPI_Datatype MPIType<long long>();
-extern template DataType TensorFlowDataType<float>();
-extern template DataType TensorFlowDataType<int>();
-extern template DataType TensorFlowDataType<long long>();
-
-// Generate all necessary specializations for RingAllreduce.
-template Status RingAllreduce<CPUDevice, int>(OpKernelContext*, const Tensor*,
- Tensor*, Tensor*);
-template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-template Status RingAllreduce<CPUDevice, float>(OpKernelContext*, const Tensor*,
- Tensor*, Tensor*);
-
-// Generate all necessary specializations for RingAllgather.
-template Status RingAllgather<CPUDevice, int>(OpKernelContext*, const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-template Status RingAllgather<CPUDevice, long long>(OpKernelContext*,
- const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-template Status RingAllgather<CPUDevice, float>(OpKernelContext*, const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-
-// Copy data on a CPU using a straightforward memcpy.
-template <>
-void CopyTensorData<CPUDevice>(void* dst, void* src, size_t size) {
- std::memcpy(dst, src, size);
-};
-
-// Accumulate values on a CPU.
-#define GENERATE_ACCUMULATE(type) \
- template <> \
- void AccumulateTensorData<CPUDevice, type>(type * dst, type * src, \
- size_t size) { \
- for (unsigned int i = 0; i < size; i++) { \
- dst[i] += src[i]; \
- } \
- };
-GENERATE_ACCUMULATE(int);
-GENERATE_ACCUMULATE(long long);
-GENERATE_ACCUMULATE(float);
-#undef GENERATE_ACCUMULATE
-
-} // namespace mpi
-} // namespace contrib
-} // namespace tensorflow
-
-#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/ring.cu.cc
deleted file mode 100644
index 2f3eef366a..0000000000
--- a/tensorflow/contrib/mpi_collectives/ring.cu.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-template <>
-MPI_Datatype MPIType<float>() {
- return MPI_FLOAT;
-};
-template <>
-MPI_Datatype MPIType<int>() {
- return MPI_INT;
-};
-template <>
-MPI_Datatype MPIType<long long>() {
- return MPI_LONG_LONG;
-};
-
-template <>
-DataType TensorFlowDataType<float>() {
- return DT_FLOAT;
-};
-template <>
-DataType TensorFlowDataType<int>() {
- return DT_INT32;
-};
-template <>
-DataType TensorFlowDataType<long long>() {
- return DT_INT64;
-};
-
-// Generate all necessary specializations for RingAllreduce.
-template Status RingAllreduce<GPUDevice, int>(OpKernelContext*, const Tensor*,
- Tensor*, Tensor*);
-template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
- const Tensor*, Tensor*,
- Tensor*);
-template Status RingAllreduce<GPUDevice, float>(OpKernelContext*, const Tensor*,
- Tensor*, Tensor*);
-
-// Generate all necessary specializations for RingAllgather.
-template Status RingAllgather<GPUDevice, int>(OpKernelContext*, const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-template Status RingAllgather<GPUDevice, long long>(OpKernelContext*,
- const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-template Status RingAllgather<GPUDevice, float>(OpKernelContext*, const Tensor*,
- const std::vector<size_t>&,
- Tensor*);
-
-// Synchronously copy data on the GPU, using a different stream than the default
-// and than TensorFlow to avoid synchronizing on operations unrelated to the
-// allreduce.
-template <>
-void CopyTensorData<GPUDevice>(void* dst, void* src, size_t size) {
- auto stream = CudaStreamForMPI();
- cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
- cudaStreamSynchronize(stream);
-};
-
-// Elementwise accumulation kernel for GPU.
-template <typename T>
-__global__ void elemwise_accum(T* out, const T* in, const size_t N) {
- for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
- i += blockDim.x * gridDim.x) {
- out[i] += in[i];
- }
-}
-
-// Synchronously accumulate tensors on the GPU, using a different stream than
-// the default and than TensorFlow to avoid synchronizing on operations
-// unrelated to the allreduce.
-#define GENERATE_ACCUMULATE(type) \
- template <> \
- void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
- size_t size) { \
- auto stream = CudaStreamForMPI(); \
- elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size); \
- cudaStreamSynchronize(stream); \
- };
-GENERATE_ACCUMULATE(int);
-GENERATE_ACCUMULATE(long long);
-GENERATE_ACCUMULATE(float);
-#undef GENERATE_ACCUMULATE
-
-} // namespace mpi
-} // namespace contrib
-} // namespace tensorflow
-#endif // GOOGLE_CUDA
-
-#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
deleted file mode 100644
index cae57ce60e..0000000000
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_MPI_H_
-#define TENSORFLOW_CONTRIB_MPI_H_
-
-#ifdef TENSORFLOW_USE_MPI
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
-
-#if GOOGLE_CUDA
-#include "cuda_runtime.h"
-#endif
-
-// Needed to avoid header issues with C++-supporting MPI implementations
-#define OMPI_SKIP_MPICXX
-#include "third_party/mpi/mpi.h"
-
-#define TAG_TENSOR 12
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-using GPUDevice = Eigen::GpuDevice;
-
-// Convert from templated types to values we can pass to MPI.
-template <typename T>
-MPI_Datatype MPIType();
-
-// Convert from templated types to TensorFlow data types.
-template <typename T>
-DataType TensorFlowDataType();
-
-#define MPI_REQUIRES_OK(MPI_STATUS) \
- if ((MPI_STATUS) != MPI_SUCCESS) { \
- return errors::Unknown("MPI operation failed unexpectedly."); \
- }
-
-// Copy data from one tensor to another tensor.
-// This uses a custom CUDA stream on GPU, which is necessary to overlap the
-// backpropagation computations with the allreduce.
-template <typename Device>
-void CopyTensorData(void* destination, void* source, size_t size);
-
-// Add a tensor into another tensor, accumulating in place.
-// This uses a custom CUDA stream on GPU, which is necessary to overlap the
-// backpropagation computations with the allreduce.
-template <typename Device, typename T>
-void AccumulateTensorData(T* destination, T* source, size_t size);
-
-// We need to get the right stream for doing CUDA memory transfers and
-// operations, which is possibly different from the standard TensorFlow stream.
-#if GOOGLE_CUDA
-cudaStream_t CudaStreamForMPI();
-#endif
-
-/* Perform a ring allreduce on the data. Allocate the necessary output tensor
- * and store it in the output parameter.
- *
- * Assumes that all MPI processes are doing an allreduce of the same tensor,
- * with the same dimensions.
- *
- * A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the
- * allreduce, the nodes involved are arranged in a ring:
- *
- * .--0--.
- * / \
- * 3 1
- * \ /
- * *--2--*
- *
- * Each node always sends to the next clockwise node in the ring, and receives
- * from the previous one.
- *
- * The allreduce is done in two parts: a scatter-reduce and an allgather. In
- * the scatter reduce, a reduction is done, so that each node ends up with a
- * chunk of the final output tensor which has contributions from all other
- * nodes. In the allgather, those chunks are distributed among all the nodes,
- * so that all nodes have the entire output tensor.
- *
- * Both of these operations are done by dividing the input tensor into N
- * evenly sized chunks (where N is the number of nodes in the ring).
- *
- * The scatter-reduce is done in N-1 steps. In the ith step, node j will send
- * the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it in to
- * its existing data for that chunk. For example, in the first iteration with
- * the ring depicted above, you will have the following transfers:
- *
- * Segment 0: Node 0 --> Node 1
- * Segment 1: Node 1 --> Node 2
- * Segment 2: Node 2 --> Node 3
- * Segment 3: Node 3 --> Node 0
- *
- * In the second iteration, you'll have the following transfers:
- *
- * Segment 0: Node 1 --> Node 2
- * Segment 1: Node 2 --> Node 3
- * Segment 2: Node 3 --> Node 0
- * Segment 3: Node 0 --> Node 1
- *
- * After this iteration, Node 2 has 3 of the four contributions to Segment 0.
- * The last iteration has the following transfers:
- *
- * Segment 0: Node 2 --> Node 3
- * Segment 1: Node 3 --> Node 0
- * Segment 2: Node 0 --> Node 1
- * Segment 3: Node 1 --> Node 2
- *
- * After this iteration, Node 3 has the fully accumulated Segment 0; Node 0
- * has the fully accumulated Segment 1; and so on. The scatter-reduce is
- * complete.
- *
- * Next, the allgather distributes these fully accumulated chunks across all
- * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
- * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
- * For example, at the first iteration, the following transfers will occur:
- *
- * Segment 0: Node 3 --> Node 0
- * Segment 1: Node 0 --> Node 1
- * Segment 2: Node 1 --> Node 2
- * Segment 3: Node 2 --> Node 3
- *
- * After the first iteration, Node 0 will have a fully accumulated Segment 0
- * (from Node 3) and Segment 1. In the next iteration, Node 0 will send its
- * just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3.
- * After this has continued for N - 1 iterations, all nodes will have the
- * fully accumulated tensor.
- *
- * Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the
- * allgather. Each send will contain K / N bytes, if there are K bytes in the
- * original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N
- * bytes of data, and the performance of the allreduce (assuming no latency in
- * connections) is constrained by the slowest interconnect between the nodes.
- *
- */
-template <typename Device, typename T>
-Status RingAllreduce(OpKernelContext* context, const Tensor* input,
- Tensor* temp, Tensor* output) {
- // Acquire MPI size and rank
- int n, r;
- MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
- MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
-
- T* buffer = (T*)output->tensor_data().data();
-
- CopyTensorData<Device>((void*)buffer, (void*)input->tensor_data().data(),
- output->tensor_data().size());
-
- // Calculate segment sizes and segment ends
- const size_t elements_to_reduce = input->NumElements();
- const size_t segment_size = elements_to_reduce / n;
- std::vector<size_t> segment_sizes(n, segment_size);
-
- const size_t residual = elements_to_reduce % n;
- for (size_t i = 0; i < residual; ++i) {
- segment_sizes[i]++;
- }
-
- std::vector<size_t> segment_starts(n);
- segment_starts[0] = 0;
- for (size_t i = 1; i < segment_starts.size(); ++i) {
- segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1];
- }
-
- assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce);
-
- T* segment_recv = (T*)temp->tensor_data().data();
-
- // Receive from your left neighbor with wrap-around
- const size_t recv_from = ((r - 1) + n) % n;
-
- // Send to your right neighbor with wrap-around
- const size_t send_to = (r + 1) % n;
-
- MPI_Status recv_status;
- MPI_Request recv_req;
-
- // Now start ring. At every step, for every rank, we iterate through
- // segments with wraparound and send and recv from our neighbors and reduce
- // locally. At the i'th iteration, rank r sends segment (r-i) and receives
- // segment (r-i-1).
- for (int i = 0; i < n - 1; i++) {
- const size_t send_seg_id = ((r - i) + n) % n;
- const size_t recv_seg_id = ((r - i - 1) + n) % n;
-
- T* segment_send = &(buffer[segment_starts[send_seg_id]]);
-
- MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id],
- MPIType<T>(), recv_from, TAG_TENSOR,
- MPI_COMM_WORLD, &recv_req));
-
- MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id],
- MPIType<T>(), send_to, TAG_TENSOR,
- MPI_COMM_WORLD));
-
- T* segment_update = &(buffer[segment_starts[recv_seg_id]]);
-
- // Wait for recv to complete before reduction
- MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status));
-
- const size_t recv_seg_size = segment_sizes[recv_seg_id];
- AccumulateTensorData<Device, T>(segment_update, segment_recv,
- recv_seg_size);
- }
-
- // Now start pipelined ring allgather. At every step, for every rank, we
- // iterate through segments with wraparound and send and recv from our
- // neighbors. At the i'th iteration, rank r sends segment (r-i+1) and
- // receives segment (r-i).
- for (size_t i = 0; i < n - 1; ++i) {
- const size_t send_seg_id = ((r - i + 1) + n) % n;
- const size_t recv_seg_id = ((r - i) + n) % n;
-
- // Segment to send - at every iteration we send segment (r-i+1)
- T* segment_send = &(buffer[segment_starts[send_seg_id]]);
-
- // Segment to recv - at every iteration we receive segment (r-i)
- T* segment_recv = &(buffer[segment_starts[recv_seg_id]]);
-
- MPI_REQUIRES_OK(MPI_Sendrecv(
- segment_send, segment_sizes[send_seg_id], MPIType<T>(), send_to,
- TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType<T>(),
- recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
- }
-
- return Status::OK();
-}
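The segment schedule documented in the comment above can be checked with a pure-Python simulation; this sketch reduces plain lists instead of tensors and ignores MPI entirely.

def ring_allreduce_sim(per_rank_data):
    """Simulate the scatter-reduce + allgather schedule on lists of numbers."""
    n = len(per_rank_data)
    length = len(per_rank_data[0])
    sizes = [length // n + (1 if i < length % n else 0) for i in range(n)]
    starts = [sum(sizes[:i]) for i in range(n)]
    bufs = [list(d) for d in per_rank_data]

    # Scatter-reduce: at step i, rank r sends segment (r-i) and accumulates (r-i-1).
    for i in range(n - 1):
        sends = [bufs[r][starts[(r - i) % n]:starts[(r - i) % n] + sizes[(r - i) % n]]
                 for r in range(n)]
        for r in range(n):
            seg, left = (r - i - 1) % n, (r - 1) % n
            s = starts[seg]
            for k, v in enumerate(sends[left]):
                bufs[r][s + k] += v

    # Allgather: at step i, rank r sends segment (r-i+1) and overwrites (r-i).
    for i in range(n - 1):
        sends = [bufs[r][starts[(r - i + 1) % n]:starts[(r - i + 1) % n] + sizes[(r - i + 1) % n]]
                 for r in range(n)]
        for r in range(n):
            seg, left = (r - i) % n, (r - 1) % n
            s = starts[seg]
            bufs[r][s:s + sizes[seg]] = sends[left]
    return bufs

# Every rank ends up with the element-wise sum of all ranks' data.
assert ring_allreduce_sim([[1, 1, 1], [2, 2, 2], [4, 4, 4]]) == [[7, 7, 7]] * 3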
-
-// Perform a ring allgather on a Tensor. Other ranks may allgather with a
-// tensor which differs in the first dimension only; all other dimensions must
-// be the same.
-//
-// For more information on the ring allgather, read the documentation for the
-// ring allreduce, which includes a ring allgather.
-template <typename Device, typename T>
-Status RingAllgather(OpKernelContext* context, const Tensor* input,
- const std::vector<size_t>& sizes, Tensor* output) {
- // Acquire MPI size and rank
- int n, r;
- MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
- MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
-
- assert(sizes.size() == n);
- assert(input->dim_size(0) == sizes[r]);
-
- // Compute the number of elements in every "row". We can't compute the number
- // of elements in every chunk, because the chunks are variable length.
- size_t elements_per_row = 1;
- for (int i = 1; i < input->shape().dims(); i++) {
- elements_per_row *= input->dim_size(i);
- }
-
- // Copy data from input tensor to correct place in output tensor.
- std::vector<size_t> segment_starts(n);
- segment_starts[0] = 0;
- for (int i = 1; i < n; i++) {
- segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1];
- }
- size_t offset = segment_starts[r];
-
- // Copy data to the right offset for this rank.
- T* buffer = (T*)output->tensor_data().data();
- CopyTensorData<Device>((void*)(buffer + offset),
- (void*)input->tensor_data().data(),
- elements_per_row * sizes[r] * sizeof(T));
-
- // Receive from your left neighbor with wrap-around
- const size_t recv_from = ((r - 1) + n) % n;
-
- // Send to your right neighbor with wrap-around
- const size_t send_to = (r + 1) % n;
-
- // Perform a ring allgather. At every step, for every rank, we iterate
- // through segments with wraparound and send and recv from our neighbors.
- // At the i'th iteration, rank r sends segment (r-i) and receives segment
- // (r-1-i).
- MPI_Status recv_status;
- for (size_t i = 0; i < n - 1; ++i) {
- const size_t send_seg_id = ((r - i) + n) % n;
- const size_t recv_seg_id = ((r - i - 1) + n) % n;
-
- // Segment to send - at every iteration we send segment (r-i)
- size_t offset_send = segment_starts[send_seg_id];
- size_t rows_send = sizes[send_seg_id];
- T* segment_send = &(buffer[offset_send]);
-
- // Segment to recv - at every iteration we receive segment (r-1-i)
- size_t offset_recv = segment_starts[recv_seg_id];
- size_t rows_recv = sizes[recv_seg_id];
- T* segment_recv = &(buffer[offset_recv]);
-
- MPI_REQUIRES_OK(MPI_Sendrecv(
- segment_send, elements_per_row * rows_send, MPIType<T>(), send_to,
- TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType<T>(),
- recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
- }
-
- return Status::OK();
-}
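A small sketch of the offset arithmetic used by RingAllgather above, where ranks contribute a variable number of first-dimension rows:

def allgather_offsets(sizes, elements_per_row):
    """Element offset of each rank's chunk in the flat output buffer."""
    starts, offset = [], 0
    for rows in sizes:
        starts.append(offset)
        offset += rows * elements_per_row
    return starts

assert allgather_offsets([2, 3, 1], elements_per_row=5) == [0, 10, 25]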
-
-} // namespace mpi
-} // namespace contrib
-} // namespace tensorflow
-
-#endif // TENSORFLOW_USE_MPI
-
-#undef TENSORFLOW_CONTRIB_MPI_H_
-#endif // TENSORFLOW_CONTRIB_MPI_H_
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index eef1ae25e9..7b883ebc5d 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1553,7 +1553,8 @@ class WeightNormLSTMCellTest(test.TestCase):
with self.test_session() as sess:
init = init_ops.constant_initializer(0.5)
- with variable_scope.variable_scope("root", initializer=init):
+ with variable_scope.variable_scope("root",
+ initializer=init):
x = array_ops.zeros([1, 2])
c0 = array_ops.zeros([1, 2])
h0 = array_ops.zeros([1, 2])
@@ -1563,12 +1564,11 @@ class WeightNormLSTMCellTest(test.TestCase):
xout, sout = cell()(x, state0)
sess.run([variables.global_variables_initializer()])
- res = sess.run(
- [xout, sout], {
- x.name: np.array([[1., 1.]]),
- c0.name: 0.1 * np.asarray([[0, 1]]),
- h0.name: 0.1 * np.asarray([[2, 3]]),
- })
+ res = sess.run([xout, sout], {
+ x.name: np.array([[1., 1.]]),
+ c0.name: 0.1 * np.asarray([[0, 1]]),
+ h0.name: 0.1 * np.asarray([[2, 3]]),
+ })
actual_state_c = res[1].c
actual_state_h = res[1].h
@@ -1579,8 +1579,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell w/o peepholes and w/o normalisation"""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=False, use_peepholes=False)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=False,
+ use_peepholes=False)
actual_c, actual_h = self._cell_output(cell)
@@ -1594,8 +1595,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell with peepholes and w/o normalisation"""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=False, use_peepholes=True)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=False,
+ use_peepholes=True)
actual_c, actual_h = self._cell_output(cell)
@@ -1610,8 +1612,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell w/o peepholes and with normalisation"""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=True, use_peepholes=False)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=True,
+ use_peepholes=False)
actual_c, actual_h = self._cell_output(cell)
@@ -1625,8 +1628,9 @@ class WeightNormLSTMCellTest(test.TestCase):
"""Tests cell with peepholes and with normalisation"""
def cell():
- return contrib_rnn_cell.WeightNormLSTMCell(
- 2, norm=True, use_peepholes=True)
+ return contrib_rnn_cell.WeightNormLSTMCell(2,
+ norm=True,
+ use_peepholes=True)
actual_c, actual_h = self._cell_output(cell)
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index d6184d6109..554eb24e52 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -724,7 +724,7 @@ def _mask_probs(probs, eos_token, finished):
eos_token,
vocab_size,
dtype=probs.dtype,
- on_value=0.,
+ on_value=ops.convert_to_tensor(0., dtype=probs.dtype),
off_value=probs.dtype.min)
finished_probs = array_ops.tile(
array_ops.reshape(finished_row, [1, 1, -1]),
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index ad5e985487..b3343aef47 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -221,7 +221,7 @@ def parallel_read(data_sources,
the data will be cycled through indefinitely.
num_readers: a integer, number of Readers to create.
reader_kwargs: an optional dict, of kwargs for the reader.
- shuffle: boolean, wether should shuffle the files and the records by using
+ shuffle: boolean, whether should shuffle the files and the records by using
RandomShuffleQueue as common_queue.
dtypes: A list of types. The length of dtypes must equal the number
of elements in each record. If it is None it will default to
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index 04e6b0a735..dc3e9fe79d 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -468,7 +468,7 @@ class FixedSizeSparseClassificationGrowStats : public ClassificationStats {
void PackToProto(FertileSlot* slot) const override;
void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
- LeafStat* right_stats) const;
+ LeafStat* right_stats) const override;
protected:
void ClassificationAddSplitStats() override {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4003ba056d..9ee717dd7f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -809,9 +809,9 @@ tensorflow::Status BinaryTensorOpTensor(
CHECK_EQ_TYPE(tensor_r->getType(), dtype);
auto op_pair = ops.find(node_def.op());
if (op_pair == ops.end())
- return tensorflow::errors::Unimplemented(
- "binary op: " + node_def.op() +
- " not supported at: " + node_def.name());
+ return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+ " not supported at: " +
+ node_def.name());
nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
*const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1471,13 +1471,13 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
<< std::to_string(op_info_vec.size());
// TODO(ben,jie): update TRT input format/dimension
- nvinfer1::DimsCHW input_dim_pseudo_chw;
- for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
+ nvinfer1::DimsCHW input_dim_psuedo_chw;
+ for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
for (int i = 1; i < op_info.shape().dim_size(); i++) {
VLOG(2) << "dimension: " << i
<< " , size: " << op_info.shape().dim(i).size();
- input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
+ input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
}
// TODO(ben,jie): proper way to restore input tensor name?
@@ -1486,7 +1486,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
input_tensor_name = node_name + ":" + std::to_string(output_idx);
nvinfer1::ITensor* input_tensor = converter.network()->addInput(
- input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
+ input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
if (!input_tensor)
return tensorflow::errors::InvalidArgument(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index fff972c1f3..ed3ed4c0e1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -140,11 +140,13 @@ py_library(
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:state_ops",
+ "//tensorflow/python:summary",
"//tensorflow/python:util",
"//tensorflow/python:variable_scope",
"//tensorflow/python/estimator:estimator_py",
"//tensorflow/python/estimator:export",
"//tensorflow/python/estimator:head",
+ "//tensorflow/python/estimator:metric_keys",
],
)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index f0330bfbbd..5c49e903ab 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.timeseries.python.timeseries import feature_keys
from tensorflow.python.estimator import estimator_lib
from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import metric_keys
from tensorflow.python.estimator.export import export_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
@@ -35,6 +36,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest
+from tensorflow.python.summary import summary
def time_series_regression_head(model,
@@ -71,12 +73,31 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
self.input_statistics_generator = input_statistics_generator
self._name = name
+ @property
+ def name(self):
+ return self._name
+
+ # TODO(terrytangyuan): consolidate `model_outputs` and `_Head.LossSpec`
+ # once `_Head.create_loss` becomes extendable
+ def create_loss(self, features, mode, logits=None, labels=None):
+ """See `_Head`."""
+ model_outputs = self.state_manager.define_loss(
+ self.model, features, mode)
+ summary.scalar(
+ head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),
+ model_outputs.loss)
+ return model_outputs
+
+ @property
+ def logits_dimension(self):
+ """See `_Head`."""
+ return 1
+
def _train_ops(self, features):
"""Add training ops to the graph."""
+ mode = estimator_lib.ModeKeys.TRAIN
with variable_scope.variable_scope("model"):
- model_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.TRAIN)
-
+ model_outputs = self.create_loss(features, mode)
train_op = optimizers.optimize_loss(
model_outputs.loss,
global_step=training_util.get_global_step(),
@@ -85,31 +106,14 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
learning_rate=None)
return estimator_lib.EstimatorSpec(
loss=model_outputs.loss,
- mode=estimator_lib.ModeKeys.TRAIN,
+ mode=mode,
train_op=train_op)
- # TODO(terrytangyuan): suffix summary and metrics keys by `"/" + name`
- @property
- def name(self):
- return self._name
-
- # TODO(terrytangyuan): unused for now. Need to decouple
- # `state_manager.define_loss` to satisfy the extendable return signature of
- # `_Head.create_loss`.
- def create_loss(self, features, mode, logits, labels):
- """See `_Head`."""
- return None
-
- # TODO(terrytangyuan): check label dimension
- @property
- def logits_dimension(self):
- return None
-
def _evaluate_ops(self, features):
"""Add ops for evaluation (aka filtering) to the graph."""
+ mode = estimator_lib.ModeKeys.EVAL
with variable_scope.variable_scope("model"):
- model_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.EVAL)
+ model_outputs = self.create_loss(features, mode)
metrics = {}
# Just output in-sample predictions for the last chunk seen
for prediction_key, prediction_value in model_outputs.predictions.items():
@@ -122,7 +126,7 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
model_outputs.end_state))
return estimator_lib.EstimatorSpec(
loss=model_outputs.loss,
- mode=estimator_lib.ModeKeys.EVAL,
+ mode=mode,
eval_metric_ops=metrics,
predictions={})
@@ -140,9 +144,8 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
with variable_scope.variable_scope("model"):
prediction_outputs = self.model.predict(features=features)
with variable_scope.variable_scope("model", reuse=True):
- filtering_outputs = self.state_manager.define_loss(
- self.model, features, estimator_lib.ModeKeys.EVAL)
-
+ filtering_outputs = self.create_loss(
+ features, estimator_lib.ModeKeys.EVAL)
return estimator_lib.EstimatorSpec(
mode=estimator_lib.ModeKeys.PREDICT,
export_outputs={
@@ -191,7 +194,7 @@ class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acc
def create_estimator_spec(self, features, mode, labels=None):
"""Performs basic error checking and returns an EstimatorSpec."""
- with ops.name_scope("head"):
+ with ops.name_scope(self._name, "head"):
if labels:
raise ValueError(
"The model received a `labels` dictionary, which is "
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
deleted file mode 100644
index ed0f398e30..0000000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""SGDR learning rate decay function."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops, control_flow_ops
-
-
-def sgdr_decay(learning_rate, global_step, initial_period_steps,
- t_mul=2.0, m_mul=1.0, name=None):
- """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
-
- As described in "SGDR: Stochastic Gradient Descent
- with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
- ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
-
- The learning rate decreases according to cosine annealing:
-
- ```python
- learning_rate * 0.5 * (1 + cos(x_val * pi)) # for x_val defined in [0, 1]
- ```
-
- Thus, at the beginning (when the restart index i = 0),
- the learning rate decreases for `initial_period_steps` steps from the initial
- learning rate `learning_rate` (when `x_val=0`, we get `cos(0)=1`) to
- 0 (when `x_val=1`, we get `cos(pi)=-1`).
-
- The decrease within the i-th period takes `t_i` steps,
- where `t_0` = `initial_period_steps` is the user-defined number of batch
- iterations (not epochs as in the paper) to be performed before the first
- restart is launched.
-
- Then, we perform the first restart (i=1) by setting the learning rate to
- `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
- The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
- restart runs `t_mul` times longer than the previous one.
-
- Importantly, when one has no access to a validation set, SGDR suggests
- to report the best expected / recommended solution in the following way:
- When we are within our initial run (i=0), every new solution represents
- SGDR's recommended solution. Instead, when i>0, the recommended solution is
- the one obtained at the end of each restart.
-
- Note that the minimum learning rate is set to 0 for simplicity,
- you can adjust the code to deal with any positive minimum learning rate
- as defined in the paper.
-
- `initial_period_steps` is the duration of the first period measured in terms
- of number of minibatch updates. If one wants to use epochs, one should compute
- the number of updates required for an epoch.
-
- For example, assume the following parameters and intention:
- Minibatch size: 100
- Training dataset size: 10000
- If the user wants the first decay period to span across 5 epochs, then
- `initial_period_steps` = 5 * 10000/100 = 500
-
- Train for 10000 batch iterations with the initial learning rate set to
- 0.1, then restart to run 2 times longer, i.e, for 20000 batch iterations
- and with the initial learning rate 0.05, then restart again and again,
- doubling the runtime of each new period and with two times smaller
- initial learning rate.
-
- To accomplish the above, one would write:
-
- ```python
- ...
- global_step = tf.Variable(0, trainable=False)
- starter_learning_rate = 0.1
- learning_rate = sgdr_decay(starter_learning_rate, global_step,
- initial_period_steps=10000, t_mul=2, m_mul=0.5)
- # Passing global_step to minimize() will increment it at each step.
- learning_step = (
- tf.train.GradientDescentOptimizer(learning_rate)
- .minimize(...my loss..., global_step=global_step)
- )
-
- # Step | 0 | 1000 | 5000 | 9000 | 9999 | 10000 | 11000 |
- # LR | 0.1 | 0.097 | 0.05 | 0.002 | 0.00 | 0.05 | 0.0496 |
-
- # Step | 20000 | 29000 | 29999 | 30000 |
- # LR | 0.025 | 0.0003 | 0.00 | 0.025 |
- ```
-
- Args:
- learning_rate: A scalar `float32` or `float64` `Tensor` or a
- Python number. The initial learning rate.
- global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
- Global step to use for the decay computation. Must not be negative.
- initial_period_steps: Duration of the first period measured as the number
- of minibatch updates, if one wants to use epochs, one should compute
- the number of updates required for an epoch.
- t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
- Must be positive.
- Used to derive the number of iterations in the i-th period:
- `initial_period_steps * (t_mul^i)`. Defaults to 2.0.
- m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
- Must be positive.
- Used to derive the initial learning rate of the i-th period:
- `learning_rate * (m_mul^i)`. Defaults to 1.0
-
- Returns:
- A scalar `Tensor` of the same type as `learning_rate`.
- The learning rate for a provided global_step.
- Raises:
- ValueError: if `global_step` is not supplied.
- """
-
- if global_step is None:
- raise ValueError("global_step is required for sgdr_decay.")
- with ops.name_scope(name, "SGDRDecay",
- [learning_rate, global_step,
- initial_period_steps, t_mul, m_mul]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate,
- name="initial_learning_rate")
- dtype = learning_rate.dtype
- global_step = math_ops.cast(global_step, dtype)
- t_0 = math_ops.cast(initial_period_steps, dtype)
- t_mul = math_ops.cast(t_mul, dtype)
- m_mul = math_ops.cast(m_mul, dtype)
-
- c_one = math_ops.cast(constant_op.constant(1.0), dtype)
- c_half = math_ops.cast(constant_op.constant(0.5), dtype)
- c_pi = math_ops.cast(constant_op.constant(math.pi), dtype)
-
- # Find normalized value of the current step
- x_val = math_ops.div(global_step, t_0)
-
- def compute_step(x_val, geometric=False):
- if geometric:
- # Consider geometric series where t_mul != 1
- # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)
-
- # First find how many restarts were performed for a given x_val
- # Find maximal integer i_restart value for which this equation holds
- # x_val >= (1 - t_mul^i_restart) / (1 - t_mul)
- # x_val * (1 - t_mul) <= (1 - t_mul^i_restart)
- # t_mul^i_restart <= (1 - x_val * (1 - t_mul))
-
- # tensorflow allows only log with base e
- # i_restart <= log(1 - x_val * (1 - t_mul) / log(t_mul)
- # Find how many restarts were performed
-
- i_restart = math_ops.floor(
- math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
- # Compute the sum of all restarts before the current one
- sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul)
- # Compute our position within the current restart
- x_val = (x_val - sum_r) / t_mul ** i_restart
-
- else:
- # Find how many restarts were performed
- i_restart = math_ops.floor(x_val)
- # Compute our position within the current restart
- x_val = x_val - i_restart
- return i_restart, x_val
-
- i_restart, x_val = control_flow_ops.cond(
- math_ops.equal(t_mul, c_one),
- lambda: compute_step(x_val, geometric=False),
- lambda: compute_step(x_val, geometric=True))
-
- # If m_mul < 1, then the initial learning rate of every new restart will be
- # smaller, i.e., by a factor of m_mul ** i_restart at i_restart-th restart
- m_fac = learning_rate * (m_mul ** i_restart)
-
- return math_ops.multiply(c_half * m_fac,
- (math_ops.cos(x_val * c_pi) + c_one), name=name)
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
deleted file mode 100644
index 4a46e9a49e..0000000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functional test for sgdr learning rate decay."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from sgdr_learning_rate_decay import sgdr_decay
-from tensorflow.python.platform import googletest
-from tensorflow.python.framework import test_util
-from tensorflow.python.framework import dtypes
-from tensorflow import placeholder
-
-
-class SGDRDecayTest(test_util.TensorFlowTestCase):
- """Unit tests for SGDR learning rate decay."""
-
- def get_original_values(self, lr, t_e, mult_factor, iter_per_epoch, epochs):
- """Get an array with learning rate values from the consecutive steps using
- the original implementation
- (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
- t0 = math.pi / 2.0
- tt = 0
- te_next = t_e
-
- lr_values = []
- sh_lr = lr
- for epoch in range(epochs):
- for _ in range(iter_per_epoch):
- # In the original approach training function is executed here
- lr_values.append(sh_lr)
- dt = 2.0 * math.pi / float(2.0 * t_e)
- tt = tt + float(dt) / iter_per_epoch
- if tt >= math.pi:
- tt = tt - math.pi
- cur_t = t0 + tt
- new_lr = lr * (1.0 + math.sin(cur_t)) / 2.0 # lr_min = 0, lr_max = lr
- sh_lr = new_lr
- if (epoch + 1) == te_next: # time to restart
- sh_lr = lr
- tt = 0 # by setting to 0 we set lr to lr_max, see above
- t_e = t_e * mult_factor # change the period of restarts
- te_next = te_next + t_e # note the next restart's epoch
-
- return lr_values
-
- def get_sgdr_values(self, lr, initial_period_steps, t_mul, iters):
- """Get an array with learning rate values from the consecutive steps
- using current tensorflow implementation."""
- with self.test_session():
- step = placeholder(dtypes.int32)
-
- decay = sgdr_decay(lr, step, initial_period_steps, t_mul)
- lr_values = []
- for i in range(iters):
- lr_values.append(decay.eval(feed_dict={step: i}))
-
- return lr_values
-
- def testCompareToOriginal(self):
- """Compare values generated by tensorflow implementation to the values
- generated by the original implementation
- (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
- with self.test_session():
- lr = 10.0
- init_steps = 2
- t_mul = 3
- iters = 10
- epochs = 50
-
- org_lr = self.get_original_values(lr, init_steps, t_mul, iters, epochs)
- sgdr_lr = self.get_sgdr_values(lr, init_steps*iters, t_mul, iters*epochs)
-
- for org, sgdr in zip(org_lr, sgdr_lr):
- self.assertAllClose(org, sgdr)
-
- def testMDecay(self):
- """Test m_mul argument. Check values for learning rate at the beginning
- of the first, second, third and fourth period. """
- with self.test_session():
- step = placeholder(dtypes.int32)
-
- lr = 0.1
- t_e = 10
- t_mul = 3
- m_mul = 0.9
-
- decay = sgdr_decay(lr, step, t_e, t_mul, m_mul)
-
- test_step = 0
- self.assertAllClose(decay.eval(feed_dict={step: test_step}),
- lr)
-
- test_step = t_e
- self.assertAllClose(decay.eval(feed_dict={step: test_step}),
- lr * m_mul)
-
- test_step = t_e + t_e*t_mul
- self.assertAllClose(decay.eval(feed_dict={step: test_step}),
- lr * m_mul**2)
-
- test_step = t_e + t_e*t_mul + t_e * (t_mul**2)
- self.assertAllClose(decay.eval(feed_dict={step: test_step}),
- lr * (m_mul**3))
-
- def testCos(self):
- """Check learning rate values at the beginning, in the middle
- and at the end of the period."""
- with self.test_session():
- step = placeholder(dtypes.int32)
- lr = 0.2
- t_e = 1000
- t_mul = 1
-
- decay = sgdr_decay(lr, step, t_e, t_mul)
-
- test_step = 0
- self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
- test_step = t_e//2
- self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
- test_step = t_e
- self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
- test_step = t_e*3//2
- self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
-if __name__ == "__main__":
- googletest.main()
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 7d95b6522c..86350a08e5 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000..e21f56ba5b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,85 @@
+op {
+ graph_op_name: "UniqueWithCountsV2"
+ in_arg {
+ name: "x"
+ description: <<END
+A `Tensor`.
+END
+ }
+ in_arg {
+ name: "axis"
+ description: <<END
+A `Tensor` of type `int32` (default: None). The axis of the Tensor along
+which to find the unique elements.
+END
+ }
+ out_arg {
+ name: "y"
+ description: <<END
+A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+END
+ }
+ out_arg {
+ name: "idx"
+ description: <<END
+A 1-D Tensor. Has the same type as x that contains the index of each
+value of x in the output y.
+END
+ }
+ out_arg {
+ name: "count"
+ description: <<END
+A 1-D Tensor. The count of each value of x in the output y.
+END
+ }
+ summary: "Finds unique elements along an axis of a tensor."
+ description: <<END
+This operation returns a tensor `y` containing the unique elements
+along the `axis` of a tensor. The returned unique elements are sorted
+in the same order as they occur along `axis` in `x`.
+This operation also returns a tensor `idx` and a tensor `count`
+that are the same size as the number of elements in `x` along the
+`axis` dimension. The `idx` contains the index in the unique output `y`
+and the `count` contains the count of each element of `x` in the unique output `y`.
+In other words, for a `1-D` tensor `x` with `axis = None`:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx, count = unique_with_counts(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+count ==> [2, 1, 3, 1, 2]
+```
+
+For a `2-D` tensor `x` with `axis = 0`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+# [1, 0, 0],
+# [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=0)
+y ==> [[1, 0, 0],
+ [2, 0, 0]]
+idx ==> [0, 0, 1]
+count ==> [2, 1]
+```
+
+For a `2-D` tensor `x` with `axis = 1`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+# [1, 0, 0],
+# [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=1)
+y ==> [[1, 0],
+ [1, 0],
+ [2, 0]]
+idx ==> [0, 1, 1]
+count ==> [1, 2]
+```
+END
+}
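The documented behaviour can be sanity-checked against NumPy's `np.unique` (a rough analog only: NumPy sorts its output while the op keeps first-occurrence order; for the row example above the two coincide):

```python
import numpy as np

# The 2-D example from the op documentation above.
x = np.array([[1, 0, 0],
              [1, 0, 0],
              [2, 0, 0]])

# Unique rows, the index of each row of x in y, and per-row counts.
y, idx, count = np.unique(x, axis=0, return_inverse=True, return_counts=True)
# y     -> [[1, 0, 0], [2, 0, 0]]
# idx   -> [0, 0, 1]
# count -> [2, 1]
```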
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 4e69e0bc63..4ca6780c95 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -14,20 +14,21 @@ Has same shape as data, except for dimension 0 which
has size `num_segments`.
END
}
- summary: "Computes the Max along segments of a tensor."
+ summary: "Computes the maximum along segments of a tensor."
description: <<END
Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
segments.
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the maximum such that:
\\(output_i = \max_j data_j\\) where max is over `j` such
that `segment_ids[j] == i`.
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
+If the maximum is empty for a given segment ID `i`, it outputs the smallest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::lowest()`.
<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
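A small NumPy sketch of the semantics spelled out above, including the `lowest()` fill for empty segments (reference code with made-up inputs, not the TF kernel):

```python
import numpy as np

def unsorted_segment_max_ref(data, segment_ids, num_segments):
    # Empty segments keep the initial fill: numeric_limits<T>::lowest().
    out = np.full((num_segments,) + data.shape[1:],
                  np.finfo(data.dtype).min, dtype=data.dtype)
    for i, seg in enumerate(segment_ids):
        out[seg] = np.maximum(out[seg], data[i])
    return out

data = np.array([1., 2., 3., 4.], dtype=np.float32)
print(unsorted_segment_max_ref(data, [0, 0, 2, 2], num_segments=3))
# [ 2.0000000e+00 -3.4028235e+38  4.0000000e+00]; segment 1 is empty.
```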
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000..55ea69b5dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,33 @@
+op {
+ graph_op_name: "UnsortedSegmentMin"
+ in_arg {
+ name: "segment_ids"
+ description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+ }
+ summary: "Computes the minimum along segments of a tensor."
+ description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the minimum such that:
+
+\\(output_i = \min_j data_j\\) where min is over `j` such
+that `segment_ids[j] == i`.
+
+If the minimum is empty for a given segment ID `i`, it outputs the largest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::max()`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000..577ff53d60
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,32 @@
+op {
+ graph_op_name: "UnsortedSegmentProd"
+ in_arg {
+ name: "segment_ids"
+ description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+ }
+ summary: "Computes the product along segments of a tensor."
+ description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the product of all
+entries belonging to a segment such that:
+
+\\(output_i = \prod_j data_j\\) where the product is over `j` such
+that `segment_ids[j] == i`.
+
+If there is no entry for a given segment ID `i`, it outputs 1.
+END
+}
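UnsortedSegmentMin and UnsortedSegmentProd follow the same pattern as the max sketch above, just with different initial values (`highest()` and `1`), which is the initial-value/combiner pairing the segment_reduction_ops changes below register per op. A NumPy sketch with made-up inputs:

```python
import numpy as np

def unsorted_segment_reduce_ref(data, segment_ids, num_segments, init, combine):
    # A reduction is an initial value plus a pairwise combiner.
    out = np.full((num_segments,) + data.shape[1:], init, dtype=data.dtype)
    for i, seg in enumerate(segment_ids):
        out[seg] = combine(out[seg], data[i])
    return out

data = np.array([2., 3., 4.], dtype=np.float32)
ids = [0, 0, 2]
print(unsorted_segment_reduce_ref(data, ids, 3, 1.0, np.multiply))
# [6. 1. 4.]                  empty segment 1 stays at the initial value 1
print(unsorted_segment_reduce_ref(data, ids, 3,
                                  np.finfo(np.float32).max, np.minimum))
# [2.0000000e+00 3.4028235e+38 4.0000000e+00]   empty segment 1 stays at highest()
```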
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000..71b35eaab5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "UniqueWithCounts"
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000..7876e55cf3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "UniqueWithCountsV2"
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 90664c3612..51b9547f53 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,8 +43,8 @@ SessionMgr::SessionMgr(
worker_cache_factory_(std::move(worker_cache_factory)) {}
string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
- return strings::StrCat("/job:", server_def.job_name(),
- "/replica:0/task:", server_def.task_index());
+ return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
+ server_def.task_index());
}
Status SessionMgr::CreateSession(const string& session,
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 99a5d0a054..4c38fbbe59 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
#include <complex>
-
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
// Disable clang-format to prevent 'FixedPoint' header from being included
// before 'Tensor' header on which it depends.
@@ -43,12 +42,47 @@ typedef Eigen::QUInt16 quint16;
} // namespace tensorflow
+
+
+
+static inline tensorflow::bfloat16 FloatToBFloat16(float float_val) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ return *reinterpret_cast<tensorflow::bfloat16*>(
+ reinterpret_cast<uint16_t*>(&float_val));
+#else
+ return *reinterpret_cast<tensorflow::bfloat16*>(
+ &(reinterpret_cast<uint16_t*>(&float_val)[1]));
+#endif
+}
+
namespace Eigen {
-// TOOD(xpan): We probably need to overwrite more methods to have correct eigen
-// behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
+// TODO(xpan): We probably need to overwrite more methods to have correct eigen
+// behavior. E.g. epsilon(), dummy_precision, etc. See NumTraits.h in eigen.
template <>
struct NumTraits<tensorflow::bfloat16>
- : GenericNumTraits<tensorflow::bfloat16> {};
+ : GenericNumTraits<tensorflow::bfloat16> {
+ enum {
+ IsInteger = 0,
+ IsSigned = 1,
+ RequireInitialization = 0
+ };
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 highest() {
+ return FloatToBFloat16(NumTraits<float>::highest());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 lowest() {
+ return FloatToBFloat16(NumTraits<float>::lowest());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 infinity() {
+ return FloatToBFloat16(NumTraits<float>::infinity());
+ }
+
+ static EIGEN_STRONG_INLINE tensorflow::bfloat16 quiet_NaN() {
+ return FloatToBFloat16(NumTraits<float>::quiet_NaN());
+ }
+};
+
using ::tensorflow::operator==;
using ::tensorflow::operator!=;
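For context on `FloatToBFloat16` above: it reinterprets the high-order 16 bits of an IEEE float32, which is also what the new `NumTraits<bfloat16>` specialization relies on for `highest()`, `lowest()`, `infinity()` and `quiet_NaN()`. A NumPy sketch of the same truncation (illustration only; the helper name is made up):

```python
import numpy as np

def bfloat16_truncate(x):
    # Keep the sign, exponent and top 7 mantissa bits of each float32,
    # i.e. the high-order uint16 that FloatToBFloat16 reinterprets; the
    # float32 returned here has the value the resulting bfloat16 represents.
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

print(bfloat16_truncate([np.finfo(np.float32).max, 1.0, 0.3]))
# [3.3895314e+38 1.0000000e+00 2.9882812e-01]
```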
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index e94100e994..c9e8dd2217 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -310,8 +310,8 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
return errors::Internal(
"No unary variant binary_op function found for binary variant op "
"enum: ",
- op, " Variant type_name: '", a.TypeName(),
- "' for device type: ", device);
+ op, " Variant type_name: '", a.TypeName(), "' for device type: ",
+ device);
}
return (*binary_op_fn)(ctx, a, b, out);
}
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 8c54f22f10..e8a58eea80 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
- double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
+ bfloat16, double, int32, int64);
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half,
double, int64);
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 8b86596721..33c63e7050 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -43,7 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
.TypeConstraint<int64>("Tshape"), \
ReshapeOp);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
TF_CALL_bool(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 27b8081eb8..6c4685a50a 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -20,10 +20,10 @@ limitations under the License.
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
-#include <vector>
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
+#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@@ -356,158 +356,180 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
#undef REGISTER_GPU_SORTED_KERNELS_ALL
#endif // GOOGLE_CUDA
+// ____________________________________________________________________________
+// Unsorted segment reduction ops.
+
namespace functor {
-// UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
-// UnsortedSegmentMaxFunctor.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const CPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+// The ReductionFunctor implementation for CPU.
+template <typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
- output.setZero();
+ typename TTypes<T, 2>::Tensor output) {
+ output.setConstant(InitialValueF()());
if (data_size == 0) {
return;
}
const int64 N = segment_ids.dimension(0);
+ ReductionF reduction;
auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
for (int64 i = 0; i < N; ++i) {
Index j = internal::SubtleMustCopy(segment_ids(i));
if (j < 0) {
continue;
}
- OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
+ OP_REQUIRES(ctx, FastBoundsCheck(j, num_segments),
errors::InvalidArgument(
"segment_ids", SliceDebugString(segment_ids_shape, i),
- " = ", j, " is out of range [0, ", output_rows, ")"));
- output.template chip<0>(j) += data_flat.template chip<0>(i);
+ " = ", j, " is out of range [0, ", num_segments, ")"));
+ reduction(data_flat.template chip<0>(i), output.template chip<0>(j));
}
}
};
-// UnsortedSegmentMaxFunctor implementation for CPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentMaxFunctor<CPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const CPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
- output.setConstant(std::numeric_limits<T>::lowest());
- if (data_size == 0) {
- return;
- }
- const int64 N = segment_ids.dimension(0);
- auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
- for (int64 i = 0; i < N; ++i) {
- Index j = internal::SubtleMustCopy(segment_ids(i));
- OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
- errors::InvalidArgument(
- "segment_ids", SliceDebugString(segment_ids_shape, i),
- " = ", j, " is out of range [0, ", output_rows, ")"));
- output.template chip<0>(j) =
- data_flat.template chip<0>(i).cwiseMax(output.template chip<0>(j));
- }
+
+template <typename T>
+using MatrixChip = Eigen::TensorChippingOp<0l, typename TTypes<T, 2>::Matrix>;
+
+template <typename T>
+using constMatrixChip =
+ Eigen::TensorChippingOp<0l, const typename TTypes<T, 2>::ConstMatrix>;
+
+// reduction functors
+template <typename T>
+struct SumOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output += data;
+ }
+};
+
+template <typename T>
+struct MaxOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output = data.cwiseMax(output);
+ }
+};
+
+template <typename T>
+struct MinOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output = data.cwiseMin(output);
+ }
+};
+
+template <typename T>
+struct ProdOp {
+ void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+ output *= data;
}
};
} // namespace functor
-// Base class for SegmentReductionOps that can handle unsorted segment
-// definitions
-// and specifying the size of the output in addition to a reduction function
-template <typename Device, class T, class Index>
-class UnsortedSegmentBaseOp : public OpKernel {
+// Static check routines not in the templated class to reduce code size
+static void UnsortedSegmentReductionValidation(OpKernel* op_kernel,
+ OpKernelContext* context,
+ const Tensor& data,
+ const Tensor& segment_ids,
+ const Tensor& num_segments) {
+ OP_REQUIRES(
+ context, op_kernel->IsLegacyScalar(num_segments.shape()),
+ errors::InvalidArgument("num_segments should be a scalar, not shape ",
+ num_segments.shape().DebugString()));
+ OP_REQUIRES(
+ context, TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+ errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
+ " does not start with segment_ids.shape = ",
+ segment_ids.shape().DebugString()));
+}
+
+static bool UnsortedSegmentReductionDoValidation(OpKernel* op_kernel,
+ OpKernelContext* context,
+ const Tensor& data,
+ const Tensor& segment_ids,
+ const Tensor& num_segments) {
+ UnsortedSegmentReductionValidation(op_kernel, context, data, segment_ids,
+ num_segments);
+ return context->status().ok();
+}
+
+// The UnsortedSegmentReduction OpKernel. The DeviceReductionFunctor
+// is the device specific implementation of the reduction. These device
+// specific implementations are templated themselves with the corresponding
+// initial value functors and reduction functors.
+template <typename T, typename Index, typename DeviceReductionFunctor>
+class UnsortedSegmentReductionOp : public OpKernel {
public:
- explicit UnsortedSegmentBaseOp(
- OpKernelConstruction* context,
- functor::UnsortedSegmentBaseFunctor<Device, T, Index>& functor)
- : OpKernel(context), reduction_functor_(functor) {}
+ explicit UnsortedSegmentReductionOp(OpKernelConstruction* context)
+ : OpKernel(context), reduction_functor_(DeviceReductionFunctor()) {}
void Compute(OpKernelContext* context) override {
const Tensor& data = context->input(0);
const Tensor& segment_ids = context->input(1);
const Tensor& num_segments = context->input(2);
-
- OP_REQUIRES(
- context, IsLegacyScalar(num_segments.shape()),
- errors::InvalidArgument("num_segments should be a scalar, not shape ",
- num_segments.shape().DebugString()));
- OP_REQUIRES(
- context,
- TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
- errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
- " does not start with segment_ids.shape = ",
- segment_ids.shape().DebugString()));
-
+ if (!UnsortedSegmentReductionDoValidation(this, context, data, segment_ids,
+ num_segments)) {
+ return;
+ }
const auto segment_flat = segment_ids.flat<Index>();
const Index output_rows =
internal::SubtleMustCopy(num_segments.scalar<int32>()());
OP_REQUIRES(context, output_rows >= 0,
errors::InvalidArgument("Input num_segments == ", output_rows,
" must not be negative."));
-
TensorShape output_shape;
output_shape.AddDim(output_rows);
for (int i = segment_ids.dims(); i < data.dims(); i++) {
output_shape.AddDim(data.dim_size(i));
}
-
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
auto output_flat = output->flat_outer_dims<T>();
-
auto data_ptr = data.template flat<T>().data();
- reduction_functor_(context, context->template eigen_device<Device>(),
- output_rows, segment_ids.shape(), segment_flat,
+ reduction_functor_(context, output_rows, segment_ids.shape(), segment_flat,
data.NumElements(), data_ptr, output_flat);
}
- private:
- functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
-};
-
-template <typename Device, class T, class Index>
-class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
- explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
- : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
-
- private:
- functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+ protected:
+ DeviceReductionFunctor reduction_functor_;
};
-template <typename Device, class T, class Index>
-class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
- explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
- : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
-
- private:
- functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
-};
-
-#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<CPUDevice, type, index_type>); \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentMax") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentMaxOp<CPUDevice, type, index_type>);
-
-#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+#define REGISTER_CPU_KERNEL_UNSORTEDSEGMENT( \
+ name, type, index_type, initial_value_functor, reduction_functor) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(name) \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentReductionOp< \
+ type, index_type, \
+ functor::UnsortedSegmentFunctor<CPUDevice, type, index_type, \
+ initial_value_functor, \
+ reduction_functor> >)
+
+#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
+ functor::Lowest<type>, \
+ functor::MaxOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
+ functor::Highest<type>, \
+ functor::MinOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOp<type>);
+
+#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOp<type>); \
+ REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOp<type>)
#define REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL(type) \
REGISTER_REAL_CPU_UNSORTED_KERNELS(type, int32); \
@@ -520,31 +542,72 @@ class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
TF_CALL_REAL_NUMBER_TYPES(REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL);
REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex64);
REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
+
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_KERNEL_UNSORTEDSEGMENT
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL
#if GOOGLE_CUDA
-#define REGISTER_GPU_UNSORTED_KERNELS(type, index_type) \
- REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
- .Device(DEVICE_GPU) \
- .HostMemory("num_segments") \
- .TypeConstraint<type>("T") \
- .TypeConstraint<index_type>("Tindices"), \
- UnsortedSegmentSumOp<GPUDevice, type, index_type>);
-
-#define REGISTER_GPU_UNSORTED_KERNELS_ALL(type) \
- REGISTER_GPU_UNSORTED_KERNELS(type, int32); \
- REGISTER_GPU_UNSORTED_KERNELS(type, int64);
+#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT( \
+ name, type, index_type, initial_value_functor, reduction_kernel_functor) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(name) \
+ .Device(DEVICE_GPU) \
+ .HostMemory("num_segments") \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentReductionOp< \
+ type, index_type, \
+ functor::UnsortedSegmentFunctor<GPUDevice, type, index_type, \
+ initial_value_functor, \
+ reduction_kernel_functor> >)
+
+// sum is the only op that supports all input types currently
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
+ functor::Lowest<type>, \
+ functor::MaxOpGpu<type>); \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
+ functor::Highest<type>, \
+ functor::MinOpGpu<type>); \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+ functor::One<type>, \
+ functor::ProdOpGpu<type>);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+ functor::Zero<type>, \
+ functor::SumOpGpu<type>);
+
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int64);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int64);
+
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+
+#undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex64(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex128(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-#undef REGISTER_GPU_UNSORTED_KERNELS
-#undef REGISTER_GPU_UNSORTED_KERNELS_ALL
#endif // GOOGLE_CUDA
+// ____________________________________________________________________________
+// Sparse segment reduction ops.
+
// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
// by two dense tensors, one containing the data, and the other containing
// indices into the data.
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 5c9cfe0906..fe0a2782f9 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -13,8 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
-#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.h"
@@ -46,59 +54,81 @@ struct SegmentSumFunctor {
const Index data_size, const T* data,
typename TTypes<T, 2>::Tensor output);
};
-#endif
-// BaseFunctor for definition of UnsorteSegmentReductionOp
-// for usage without templates.
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor {
- virtual ~UnsortedSegmentBaseFunctor() {}
- virtual void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows,
- const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output){};
-};
+#endif
-// Functor for UnsortedSegmentSumOp.
-// output_rows: the number of output segments (unique segment ids in
-// 'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-// perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor
- : public UnsortedSegmentBaseFunctor<Device, T, Index> {
- void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename Device, typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
typename TTypes<T, 2>::Tensor output);
};
-// Functor for UnsortedSegmentMaxOp.
-// output_rows: the number of output segments (unique segment ids in
-// 'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-// perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor
- : public UnsortedSegmentBaseFunctor<Device, T, Index> {
- void operator()(OpKernelContext* ctx, const Device& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
- typename TTypes<Index>::ConstFlat segment_ids,
- const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output);
+#ifdef GOOGLE_CUDA
+// reduction functors for the gpu
+template <typename T>
+struct SumOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicAdd(dest, value);
+ }
+};
+
+template <typename T>
+struct ProdOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMul(dest, value);
+ }
+};
+
+template <typename T>
+struct MaxOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMax(dest, value);
+ }
+};
+
+template <typename T>
+struct MinOpGpu {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+ const T& value) {
+ CudaAtomicMin(dest, value);
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+// initial value functors
+template <typename T>
+struct Zero {
+ EIGEN_STRONG_INLINE T operator()() const { return T(0); }
+};
+
+template <typename T>
+struct One {
+ EIGEN_STRONG_INLINE T operator()() const { return T(1); }
};
+
+template <typename T>
+struct Lowest {
+ EIGEN_STRONG_INLINE T operator()() const {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+
+template <typename T>
+struct Highest {
+ EIGEN_STRONG_INLINE T operator()() const {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+
} // namespace functor
} // namespace tensorflow
-#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 39d520698e..3511c85f71 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -17,43 +17,19 @@ limitations under the License.
#define EIGEN_USE_GPU
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
+// We need to include cuda_kernel_helper.h before segment_reduction_ops.h
+// See comment in segment_reduction_ops.h for more details.
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/cuda_device_functions.h"
+
namespace tensorflow {
using GPUDevice = Eigen::GpuDevice;
-// Helper for UnusortedSegmentSumCustomKernel that adds value into dest
-// atomically.
-template <typename T>
-static __device__ __forceinline__ void AccumulateInto(T* dest, const T& value) {
- CudaAtomicAdd(dest, value);
-}
-
-// Specializations of AccumulateInto for complex types, which CudaAtomicAdd does
-// not support. We treat a std::complex<T>* as a T* (the C++ standard section
-// 26.4.4 allows this explicitly) and atomic add the real and imaginary
-// components individually. The operation as a whole is not atomic, but we can
-// safely treat the components independently for the purpose of accumulating.
-template <>
-__device__ __forceinline__ void AccumulateInto(
- std::complex<float>* dest, const std::complex<float>& value) {
- auto dest_scalar = reinterpret_cast<float*>(dest);
- CudaAtomicAdd(dest_scalar, value.real());
- CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
-template <>
-__device__ __forceinline__ void AccumulateInto(
- std::complex<double>* dest, const std::complex<double>& value) {
- auto dest_scalar = reinterpret_cast<double*>(dest);
- CudaAtomicAdd(dest_scalar, value.real());
- CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
// SortedSegmentSumFunctor kernel reduces input data just as
// UnsortedSegmentSumCustomKernel does except that input data
// is partitioned along the outer reduction dimension. This is
@@ -81,7 +57,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
const Index* segment_ids,
const T* input, T* output,
const Index total_stripe_count) {
- CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) {
+ for (int stripe_index : CudaGridRangeX(total_stripe_count)) {
const Index segment_offset = stripe_index % inner_dim_size;
const Index input_outer_dim_index_base =
stripe_index / inner_dim_size * Index(OuterDimTileSize);
@@ -106,7 +82,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
// decide whether to write result to global memory using atomic
// operations
if (last_output_segment_id == first_segment_id) {
- AccumulateInto<T>(output + output_index, sum);
+ CudaAtomicAdd(output + output_index, sum);
} else {
*(output + output_index) = sum;
}
@@ -121,31 +97,31 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
// the following strip.
const Index output_index =
last_output_segment_id * inner_dim_size + segment_offset;
- AccumulateInto<T>(output + output_index, sum);
+ CudaAtomicAdd(output + output_index, sum);
}
}
-// UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements.
+// UnsortedSegmentCustomKernel processes 'input_total_size' elements.
// Each element is mapped from input to output by a combination of its
// 'segment_ids' mapping and 'inner_dim_size'.
-template <typename T, typename Index>
-__global__ void UnsortedSegmentSumCustomKernel(
- const Index input_outer_dim_size, const Index inner_dim_size,
- const Index output_outer_dim_size, const Index* segment_ids, const T* input,
- T* output) {
+template <typename T, typename Index, typename KernelReductionFunctor>
+__global__ void UnsortedSegmentCustomKernel(const Index input_outer_dim_size,
+ const Index inner_dim_size,
+ const Index output_outer_dim_size,
+ const Index* segment_ids,
+ const T* input, T* output) {
const Index input_total_size = input_outer_dim_size * inner_dim_size;
const Index output_total_size = output_outer_dim_size * inner_dim_size;
- CUDA_1D_KERNEL_LOOP(input_index, input_total_size) {
+ for (int input_index : CudaGridRangeX(input_total_size)) {
const Index input_segment_index = input_index / inner_dim_size;
const Index segment_offset = input_index % inner_dim_size;
const Index output_segment_index = segment_ids[input_segment_index];
-
if (output_segment_index < 0 || output_segment_index >= output_total_size) {
continue;
}
const Index output_index =
output_segment_index * inner_dim_size + segment_offset;
- AccumulateInto<T>(output + output_index, ldg(input + input_index));
+ KernelReductionFunctor()(output + output_index, ldg(input + input_index));
}
}
@@ -190,41 +166,39 @@ void SegmentSumFunctor<T, Index>::operator()(
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
input_outer_dim_size, input_inner_dim_size, output_rows,
segment_ids.data(), data, output.data(), total_stripe_count);
-};
+}
-// UnsortedSegmentSumFunctor implementation for GPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
- : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
- void operator()(OpKernelContext* ctx, const GPUDevice& d,
- const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename T, typename Index, typename InitialValueF,
+ typename ReductionF>
+struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
+ void operator()(OpKernelContext* ctx, const Index num_segments,
+ const TensorShape& segment_ids_shape,
typename TTypes<Index>::ConstFlat segment_ids,
const Index data_size, const T* data,
- typename TTypes<T, 2>::Tensor output) override {
+ typename TTypes<T, 2>::Tensor output) {
if (output.size() == 0) {
return;
}
- // Set 'output' to zeros.
+ // Set 'output' to initial value.
+ GPUDevice d = ctx->template eigen_device<GPUDevice>();
CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
- SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- output.size(), output.data());
+ SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ output.size(), output.data(), InitialValueF()());
if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
return;
}
-
- // Launch kernel to compute unsorted segment sum.
+ // Launch kernel to compute unsorted segment reduction.
// Notes:
- // *) 'input_total_size' is the total number of elements to process.
+ // *) 'data_size' is the total number of elements to process.
// *) 'segment_ids.shape' is a prefix of data's shape.
// *) 'input_outer_dim_size' is the total number of segments to process.
- const Index input_total_size = data_size;
const Index input_outer_dim_size = segment_ids.dimension(0);
-  const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
-  config = GetCudaLaunchConfig(input_total_size, d);
-  UnsortedSegmentSumCustomKernel<T, Index>
+  const Index input_inner_dim_size = data_size / input_outer_dim_size;
+  config = GetCudaLaunchConfig(data_size, d);
+  UnsortedSegmentCustomKernel<T, Index, ReductionF>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- input_outer_dim_size, input_inner_dim_size, output_rows,
+ input_outer_dim_size, input_inner_dim_size, num_segments,
segment_ids.data(), data, output.data());
}
};
@@ -238,19 +212,40 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);
-#define DEFINE_GPU_SPECS_INDEX(T, Index) \
- template struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
-
-#define DEFINE_GPU_SPECS(T) \
- DEFINE_GPU_SPECS_INDEX(T, int32); \
- DEFINE_GPU_SPECS_INDEX(T, int64);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
-TF_CALL_complex64(DEFINE_GPU_SPECS);
-TF_CALL_complex128(DEFINE_GPU_SPECS);
-
-#undef DEFINE_GPU_SPECS
-#undef DEFINE_GPU_SPECS_INDEX
+#define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Lowest<T>, functor::MaxOpGpu<T>>; \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Highest<T>, functor::MinOpGpu<T>>; \
+ template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
+ functor::ProdOpGpu<T>>;
+
+// Sum is currently the only op that supports all input types.
+#define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+ template struct UnsortedSegmentFunctor< \
+ GPUDevice, T, Index, functor::Zero<T>, functor::SumOpGpu<T>>;
+
+#define DEFINE_REAL_GPU_SPECS(T) \
+ DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+ DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+#define DEFINE_SUM_GPU_SPECS(T) \
+ DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+ DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_REAL_GPU_SPECS);
+TF_CALL_int32(DEFINE_REAL_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_SUM_GPU_SPECS);
+TF_CALL_int32(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex64(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex128(DEFINE_SUM_GPU_SPECS);
+
+#undef DEFINE_SORTED_GPU_SPECS_INDEX
+#undef DEFINE_SORTED_GPU_SPECS
+#undef DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_REAL_GPU_SPECS
+#undef DEFINE_SUM_GPU_SPECS
} // namespace functor
} // namespace tensorflow
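The explicit instantiations above pair every reduction with the identity used to fill the output before the kernel runs (functor::Zero, One, Lowest, Highest), which is also the value an empty segment ends up holding. A hedged NumPy illustration of those pairings (the helper name is made up):

```
import numpy as np

def identity_for(reduction, dtype):
    """Initial value an empty segment ends up with (cf. Zero/One/Lowest/Highest)."""
    info = np.finfo(dtype) if np.issubdtype(dtype, np.floating) else np.iinfo(dtype)
    return {"sum": dtype.type(0),
            "prod": dtype.type(1),
            "max": info.min,                 # functor::Lowest<T>
            "min": info.max}[reduction]      # functor::Highest<T>

for r in ("sum", "prod", "max", "min"):
    print(r, identity_for(r, np.dtype(np.float32)))
```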
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 0ef8724b10..31388e4290 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -223,6 +223,16 @@ class UniqueOp : public OpKernel {
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
+ UniqueOp<type, int64>); \
+ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("out_idx"), \
+ UniqueOp<type, int32>) \
+ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
REGISTER_UNIQUE(string)
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
index a61272675b..62e814ff77 100644
--- a/tensorflow/core/kernels/unravel_index_op.cc
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -15,11 +15,11 @@ limitations under the License.
#define EIGEN_USE_THREADS
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 267ce88440..2fab62ea5c 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -1201,6 +1201,23 @@ REGISTER_OP("UniqueWithCounts")
return Status::OK();
});
+REGISTER_OP("UniqueWithCountsV2")
+ .Input("x: T")
+ .Input("axis: Taxis")
+ .Output("y: T")
+ .Output("idx: out_idx")
+ .Output("count: out_idx")
+ .Attr("T: type")
+ .Attr("Taxis: {int32,int64} = DT_INT64")
+ .Attr("out_idx: {int32, int64} = DT_INT32")
+ .SetShapeFn([](InferenceContext* c) {
+ auto uniq = c->Vector(InferenceContext::kUnknownDim);
+ c->set_output(0, uniq);
+ c->set_output(1, c->input(0));
+ c->set_output(2, uniq);
+ return Status::OK();
+ });
+
namespace {
Status ShapeShapeFn(InferenceContext* c) {
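The new op returns the unique values of x, a map from every element of x back into that unique set, and a per-value count; the extra axis input selects which dimension to treat as the unit of uniqueness. A rough NumPy model of the 1-D case, assuming first-occurrence ordering as in the existing UniqueWithCounts op (a sketch only, not the registered kernel):

```
import numpy as np

def unique_with_counts(x):
    """1-D sketch: unique values y, index map idx, and per-value counts."""
    x = np.asarray(x)
    y, first, idx, count = np.unique(x, return_index=True, return_inverse=True,
                                     return_counts=True)
    # np.unique sorts its output; reorder to first-occurrence order like the TF op.
    order = np.argsort(first)
    rank = np.empty_like(order)
    rank[order] = np.arange(len(order))
    return y[order], rank[idx], count[order]

y, idx, count = unique_with_counts([1, 1, 2, 4, 4, 4, 7, 8, 8])
print(y)      # [1 2 4 7 8]
print(idx)    # [0 0 1 2 2 2 3 4 4]
print(count)  # [2 1 3 1 2]
```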
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 872ebe98c1..8f33d51d5a 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1065,6 +1065,26 @@ REGISTER_OP("UnsortedSegmentMax")
.Attr("Tnumsegments: {int32,int64} = DT_INT32")
.SetShapeFn(UnsortedSegmentReductionShapeFn);
+REGISTER_OP("UnsortedSegmentMin")
+ .Input("data: T")
+ .Input("segment_ids: Tindices")
+ .Input("num_segments: Tnumsegments")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .Attr("Tindices: {int32,int64}")
+ .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+ .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
+REGISTER_OP("UnsortedSegmentProd")
+ .Input("data: T")
+ .Input("segment_ids: Tindices")
+ .Input("num_segments: Tnumsegments")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .Attr("Tindices: {int32,int64}")
+ .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+ .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
REGISTER_OP("SparseSegmentSum")
.Input("data: T")
.Input("indices: Tidx")
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 52bf0d4694..301fcb9dbf 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include <aws/core/utils/StringUtils.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
+#include <aws/core/utils/StringUtils.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <aws/s3/model/CopyObjectRequest.h>
diff --git a/tensorflow/core/platform/vmodule_benchmark_test.cc b/tensorflow/core/platform/vmodule_benchmark_test.cc
deleted file mode 100644
index 0f9e75bf9c..0000000000
--- a/tensorflow/core/platform/vmodule_benchmark_test.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-namespace tensorflow {
-
-static void BM_DisabledVlog(int iters) {
- for (int i = 0; i < iters; ++i) {
- VLOG(1) << "Testing VLOG(1)!";
- }
-}
-BENCHMARK(BM_DisabledVlog);
-
-} // namespace tensorflow
diff --git a/tensorflow/core/platform/vmodule_test.cc b/tensorflow/core/platform/vmodule_test.cc
deleted file mode 100644
index 47b4b2e0e7..0000000000
--- a/tensorflow/core/platform/vmodule_test.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Test that popens a child process with the VLOG-ing environment variable set
-// for the logging framework, and observes VLOG_IS_ON and VLOG macro output.
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/test.h"
-
-#include <string.h>
-
-namespace tensorflow {
-namespace {
-
-int RealMain(const char* argv0, bool do_vlog) {
- if (do_vlog) {
-#if !defined(PLATFORM_GOOGLE)
- // Note, we only test this when !defined(PLATFORM_GOOGLE) because
- // VmoduleActivated doesn't exist in that implementation.
- //
- // Also, we call this internal API to simulate what would happen if
- // differently-named translation units attempted to VLOG, so we don't need
- // to create dummy translation unit files.
- bool ok = internal::LogMessage::VmoduleActivated("vmodule_test.cc", 7) &&
- internal::LogMessage::VmoduleActivated("shoobadooba.h", 3);
- if (!ok) {
- fprintf(stderr, "vmodule activated levels not as expected.\n");
- return EXIT_FAILURE;
- }
-#endif
-
- // Print info on which VLOG levels are activated.
- fprintf(stderr, "VLOG_IS_ON(8)? %d\n", VLOG_IS_ON(8));
- fprintf(stderr, "VLOG_IS_ON(7)? %d\n", VLOG_IS_ON(7));
- fprintf(stderr, "VLOG_IS_ON(6)? %d\n", VLOG_IS_ON(6));
- // Do some VLOG-ing.
- VLOG(8) << "VLOG(8)";
- VLOG(7) << "VLOG(7)";
- VLOG(6) << "VLOG(6)";
- LOG(INFO) << "INFO";
- return EXIT_SUCCESS;
- }
-
- // Popen the child process.
- std::string command = std::string(argv0);
-#if defined(PLATFORM_GOOGLE)
- command = command + " do_vlog --vmodule=vmodule_test=7 --alsologtostderr";
-#else
- command =
- "TF_CPP_VMODULE=vmodule_test=7,shoobadooba=3 " + command + " do_vlog";
-#endif
- command += " 2>&1";
- fprintf(stderr, "Running: \"%s\"\n", command.c_str());
- FILE* f = popen(command.c_str(), "r");
- if (f == nullptr) {
- fprintf(stderr, "Failed to popen child: %s\n", strerror(errno));
- return EXIT_FAILURE;
- }
-
- // Read data from the child's stdout.
- constexpr int kBufferSizeBytes = 4096;
- char buffer[kBufferSizeBytes];
- size_t result = fread(buffer, sizeof(buffer[0]), kBufferSizeBytes - 1, f);
- if (result == 0) {
- fprintf(stderr, "Failed to read from child stdout: %zu %s\n", result,
- strerror(errno));
- return EXIT_FAILURE;
- }
- buffer[result] = '\0';
- int status = pclose(f);
- if (status == -1) {
- fprintf(stderr, "Failed to close popen child: %s\n", strerror(errno));
- return EXIT_FAILURE;
- }
-
- // Check output is as expected.
- const char kExpected[] =
- "VLOG_IS_ON(8)? 0\nVLOG_IS_ON(7)? 1\nVLOG_IS_ON(6)? 1\n";
- if (strstr(buffer, kExpected) == nullptr) {
- fprintf(stderr, "error: unexpected output from child: \"%.*s\"\n",
- kBufferSizeBytes, buffer);
- return EXIT_FAILURE;
- }
- bool ok = strstr(buffer, "VLOG(7)\n") != nullptr &&
- strstr(buffer, "VLOG(6)\n") != nullptr &&
- strstr(buffer, "VLOG(8)\n") == nullptr;
- if (!ok) {
- fprintf(stderr, "error: VLOG output not as expected: \"%.*s\"\n",
- kBufferSizeBytes, buffer);
- return EXIT_FAILURE;
- }
-
- // Success!
- return EXIT_SUCCESS;
-}
-
-} // namespace
-} // namespace tensorflow
-
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- bool do_vlog = argc >= 2 && strcmp(argv[1], "do_vlog") == 0;
- return tensorflow::RealMain(argv[0], do_vlog);
-}
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index f787687f66..f2d4e470c8 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -28,14 +28,10 @@ limitations under the License.
#include <algorithm>
#include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "cuda/include/cuda.h"
-#include "cuda/include/device_functions.h"
#include "tensorflow/core/platform/types.h"
-#if CUDA_VERSION >= 7050
-#include "cuda/include/cuda_fp16.h"
-#endif // CUDA_VERSION >= 7050
-
namespace tensorflow {
namespace detail {
@@ -394,6 +390,17 @@ __global__ void SetZero(const int count, T* ptr) {
}
}
+// Helper to set all tensor entries to a specific value.
+template <typename T>
+__global__ void SetToValue(const int count, T* ptr, T value) {
+ // Check that the grid is one dimensional and index doesn't overflow.
+ assert(blockDim.y == 1 && blockDim.z == 1);
+ assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+ for (int i : CudaGridRangeX(count)) {
+ ptr[i] = value;
+ }
+}
+
namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
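CudaGridRangeX expresses the usual grid-stride loop (each thread starts at its global index and advances by the total number of threads in the grid), and SetToValue is a trivial fill kernel built on it. A CPU-side Python model of the same iteration pattern, with simulated threads instead of a real CUDA launch:

```
def cuda_grid_range_x(count, thread_id, num_threads):
    """Grid-stride loop: visit thread_id, thread_id + num_threads, ... below count."""
    return range(thread_id, count, num_threads)

def set_to_value(buf, value, num_threads=4):
    # Each simulated thread fills its strided slice of the buffer.
    for tid in range(num_threads):
        for i in cuda_grid_range_x(len(buf), tid, num_threads):
            buf[i] = value

buf = [0] * 10
set_to_value(buf, 7)
print(buf)   # [7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
```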
@@ -425,6 +432,47 @@ __device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
}));
}
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off with doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+ static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+ namespace half_impl = Eigen::half_impl;
+ intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+ assert(!(intptr & 0x1)); // should be 2-aligned.
+ if (intptr & 0x2) {
+ // The half is in the second part of the uint32 (upper 16 bits).
+ uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+ uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+ unsigned short high = static_cast<unsigned short>(arg >> 16);
+ Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+ return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+ });
+ return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+ } else {
+ // The half is in the first part of the uint32 (lower 16 bits).
+ uint32* address = reinterpret_cast<uint32*>(intptr);
+ uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+ unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+ Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+ return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+ });
+ return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
+ }
+}
+
template <typename From, typename To>
using ToTypeIfConvertible =
typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
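Because CUDA has no 16-bit atomicCAS, the helper above CASes the enclosing 32-bit word and rewrites only the 16-bit half it owns, leaving the neighbouring half untouched. A small Python simulation of that bit manipulation for a little-endian word (the accumulate step is left abstract; the real code converts the bits to Eigen::half first):

```
def cas_update_half(word, upper_half, accumulate):
    """Rewrite one 16-bit half of a 32-bit word, keeping the other half intact."""
    if upper_half:                          # target half lives in bits 16..31
        old = (word >> 16) & 0xFFFF
        new = accumulate(old) & 0xFFFF
        return (new << 16) | (word & 0xFFFF)
    else:                                   # target half lives in bits 0..15
        old = word & 0xFFFF
        new = accumulate(old) & 0xFFFF
        return (word & 0xFFFF0000) | new

word = 0xAAAA1234
print(hex(cas_update_half(word, upper_half=False, accumulate=lambda b: b + 1)))
# 0xaaaa1235 -- the neighbouring half (0xAAAA) is untouched
```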
@@ -438,6 +486,14 @@ template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
return atomicAdd(ptr, value);
}
+
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return a + value; });
+}
+
+
#if __CUDA_ARCH__ < 600
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
return detail::CudaAtomicCasHelper(ptr,
@@ -455,27 +511,74 @@ __device__ inline double CudaAtomicAdd(double* ptr, double value) {
return result;
}
#endif
-
+// CudaAtomicAdd
+// Specializations of CudaAtomicAdd for complex types, which atomicAdd does
+// not support. We treat a std::complex<T>* as a T* (the C++ standard section
+// 26.4.4 allows this explicitly) and atomic add the real and imaginary
+// components individually. The operation as a whole is not atomic, but we can
+// safely treat the components independently for the purpose of accumulating.
+__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
+ std::complex<float> value) {
+ auto ptr_scalar = reinterpret_cast<float*>(ptr);
+ return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
+ CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+__device__ inline std::complex<double> CudaAtomicAdd(
+ std::complex<double>* ptr, std::complex<double> value) {
+ auto ptr_scalar = reinterpret_cast<double*>(ptr);
+ return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
+ CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+// CudaAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
return atomicSub(ptr, value);
}
+
// Specializations of subtraction which add the negative value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
return CudaAtomicAdd(ptr, -value);
}
+
__device__ inline double CudaAtomicSub(double* ptr, double value) {
return CudaAtomicAdd(ptr, -value);
}
+
__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
tensorflow::uint64 value) {
return CudaAtomicAdd(ptr, -value);
}
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return a - value; });
+}
+
+// CudaAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
return atomicMax(ptr, value);
}
+
+__device__ inline float CudaAtomicMax(float* ptr, float value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](float a) { return max(a, value); });
+}
+
+__device__ inline double CudaAtomicMax(double* ptr, double value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](double a) { return max(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return max(a, value); });
+}
+
#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
tensorflow::uint64 value) {
@@ -484,10 +587,43 @@ __device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
}
#endif
+// CudaAtomicMin
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
+ return atomicMin(ptr, value);
+}
+
+__device__ inline float CudaAtomicMin(float* ptr, float value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](float a) { return min(a, value); });
+}
+
+__device__ inline double CudaAtomicMin(double* ptr, double value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](double a) { return min(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
+ Eigen::half value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](Eigen::half a) { return min(a, value); });
+}
+
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
+ tensorflow::uint64 value) {
+ return detail::CudaAtomicCasHelper(
+ ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+}
+#endif
+
+// CudaAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
}
+
+// CudaAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
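Floating-point max/min (and the half-precision add/sub above) have no hardware atomic, so these overloads spin on CudaAtomicCasHelper until their compare-and-swap lands; the complex CudaAtomicAdd overloads simply issue two independent scalar adds for the real and imaginary parts. A hedged Python model of the CAS retry loop, with a toy Cell class standing in for a device word:

```
import threading

class Cell:
    """Toy stand-in for a device word that supports compare-and-swap."""
    def __init__(self, value):
        self.value = value
        self._lock = threading.Lock()

    def compare_and_swap(self, expected, new):
        with self._lock:              # models the hardware CAS
            old = self.value
            if old == expected:
                self.value = new
            return old

def atomic_max(cell, value):
    old = cell.value
    while True:                       # retry until our CAS wins
        observed = cell.compare_and_swap(old, max(old, value))
        if observed == old:
            return observed           # success: returns the previous value
        old = observed                # lost the race; recompute and retry

c = Cell(1.5)
atomic_max(c, 3.0)
print(c.value)   # 3.0
```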
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 18a4c008f1..3c59524cb6 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -90,60 +90,6 @@ __device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
}
-namespace detail {
-// Overload of above function for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// This version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-template <typename F>
-__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
-#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
- static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
-#endif
- namespace half_impl = Eigen::half_impl;
- intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
- assert(!(intptr & 0x1)); // should be 2-aligned.
- if (intptr & 0x2) {
- // The half is in the second part of the uint32 (upper 16 bits).
- uint32* address = reinterpret_cast<uint32*>(intptr - 2);
- uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
- unsigned short high = static_cast<unsigned short>(arg >> 16);
- Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
- return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
- });
- return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
- } else {
- // The half is in the first part of the uint32 (lower 16 bits).
- uint32* address = reinterpret_cast<uint32*>(intptr);
- uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
- unsigned short low = static_cast<unsigned short>(arg & 0xffff);
- Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
- return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
- });
- return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
- }
-}
-} // namespace detail
-
-__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
- Eigen::half value) {
- return detail::CudaAtomicCasHelper(
- ptr, [value](Eigen::half a) { return a + value; });
-}
-__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
- Eigen::half value) {
- return detail::CudaAtomicCasHelper(
- ptr, [value](Eigen::half a) { return a - value; });
-}
-
namespace cuda_helper {
template <typename IntType>
__device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 1f7a3a1e2c..b0abf5fdd2 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -123,7 +123,7 @@ Normalizes an array across batch and spatial dimensions.
| `scale` | `ComputationDataHandle` | 1 dimensional array |
: : : (\\(\gamma\\)) :
| `offset` | `ComputationDataHandle` | 1 dimensional array |
-: : : (\\(\beta\\ ) :
+: : : (\\(\beta\\)) :
| `epsilon` | `float` | Epsilon value (\\(\epsilon\\)) |
| `feature_index` | `int64` | Index to feature dimension |
: : : in `operand` :
@@ -135,8 +135,8 @@ element in `operand`. The `feature_index` must be a valid index for the feature
dimension in `operand`.
The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
+contains `m` elements with `w` and `h` as the size of spatial dimensions
+(assuming `operand` is an 4 dimensional array):
- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
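A quick NumPy check of the batch-mean formula above: \\(\mu_l\\) averages over the batch and both spatial dimensions, producing one value per feature (illustrative only, not the XLA implementation):

```
import numpy as np

x = np.random.randn(8, 5, 5, 3).astype(np.float32)   # (m, w, h, features)
feature_index = 3

# mu_l: mean over batch and spatial dims, one value per feature l.
reduce_axes = tuple(a for a in range(x.ndim) if a != feature_index)
mu = x.mean(axis=reduce_axes)
print(mu.shape)   # (3,)
```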
@@ -170,7 +170,7 @@ Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
operation from a data shape to a target shape. The dimensions must match, and
the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
via bitcast routine. Bitcast is implemented as a low-level cast, so machines
-with different floating point representations will give different results.
+with different floating-point representations will give different results.
<b> `BitcastConvertType(operand, new_element_type)` </b>
@@ -351,7 +351,7 @@ each other) and contains the arguments in the order that they were specified.
: : : concatenated between the `operands`. :
With the exception of `dimension` all dimensions must be the same. This is
-because XLA does not support "ragged" arrays Also note that rank-0 values
+because XLA does not support "ragged" arrays. Also note that rank-0 values
cannot be concatenated (as it's impossible to name the dimension along which the
concatenation occurs).
@@ -470,7 +470,7 @@ filter/kernel/window. The dimensions are, in this order:
window that moves across the base area.
The `window_strides` argument specifies the stride of the convolutional window
-in the spatial dimensions. For example, if the stride in a the first spatial
+in the spatial dimensions. For example, if the stride in the first spatial
dimension is 3, then the window can only be placed at coordinates where the
first spatial index is divisible by 3.
@@ -944,7 +944,7 @@ expand the rank of the lower-rank operand up to the rank of the higher-rank
operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimension to equalize the
+then broadcasts the shapes along these degenerate dimensions to equalize the
shapes of both operands. The semantics are described in detail on the
@{$broadcasting$broadcasting page}.
@@ -1271,7 +1271,7 @@ result2 = while (condition, init = result1) {
```
Nested tuple shapes are not supported. For an empty tuple shape, the Infeed
-operation is effectively a nop and proceeds without reading any data from the
+operation is effectively a no-op and proceeds without reading any data from the
Infeed of the device.
> Note: We plan to allow multiple Infeed operations without a total order, in
@@ -1334,7 +1334,7 @@ dimension.
`PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
-`interior_padding`. `edge_padding_low` and `edge_padding_high` specifies the
+`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
amount of padding added at the low-end (next to index 0) and the high-end (next
to the highest index) of each dimension respectively. The amount of edge padding
can be negative -- the absolute value of negative padding indicates the number
@@ -1343,8 +1343,8 @@ the amount of padding added between any two elements in each dimension. Interior
padding occurs logically before edge padding, so in the case of negative edge
padding elements are removed from the interior-padded operand. This operation is
a no-op if the edge padding pairs are all (0, 0) and the interior padding values
-are all 0. Figure below shows examples of different `edge_padding` and
-`interior_padding` values for a two dimensional array.
+are all 0. The figure below shows examples of different `edge_padding` and
+`interior_padding` values for a two-dimensional array.
<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 6425073805..e8cf771155 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -62,9 +62,10 @@ them. For this reason TensorFlow provides **collections**, which are named lists
of tensors or other objects, such as `tf.Variable` instances.
By default every `tf.Variable` gets placed in the following two collections:
+
* `tf.GraphKeys.GLOBAL_VARIABLES` --- variables that can be shared across
-multiple devices,
- * `tf.GraphKeys.TRAINABLE_VARIABLES`--- variables for which TensorFlow will
+ multiple devices,
+ * `tf.GraphKeys.TRAINABLE_VARIABLES` --- variables for which TensorFlow will
calculate gradients.
If you don't want a variable to be trainable, add it to the
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 3692a02f2e..6797540204 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -83,21 +83,21 @@ for details. It consists of 1,068,298 learnable parameters and requires about
## Code Organization
The code for this tutorial resides in
-[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/).
+[`models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
File | Purpose
--- | ---
-[`cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
-[`cifar10_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
-[`cifar10_multi_gpu_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
-[`cifar10_eval.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
+[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
+[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
+[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
+[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
## CIFAR-10 Model
The CIFAR-10 network is largely contained in
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py).
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py).
The complete training
graph contains roughly 765 operations. We find that we can make the code most
reusable by constructing the graph with the following modules:
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 0a1c41c84a..3fe7352bd2 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in
This basic example contains the code needed to download some data, train on it a
bit and visualize the result. Once you get comfortable with reading and running
the basic version, you can graduate to
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
which is a more serious implementation that showcases some more advanced
TensorFlow principles about how to efficiently use threads to move data into a
text model, how to checkpoint during training, etc.
@@ -341,7 +341,7 @@ t-SNE.
Et voila! As expected, words that are similar end up clustering nearby each
other. For a more heavyweight implementation of word2vec that showcases more of
the advanced features of TensorFlow, see the implementation in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
## Evaluating Embeddings: Analogical Reasoning
@@ -357,7 +357,7 @@ Download the dataset for this task from
To see how we do this evaluation, have a look at the `build_eval_graph()` and
`eval()` functions in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
The choice of hyperparameters can strongly influence the accuracy on this task.
To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@ your model is seriously bottlenecked on input data, you may want to implement a
custom data reader for your problem, as described in
@{$new_data_formats$New Data Formats}. For the case of Skip-Gram
modeling, we've actually already done this for you as an example in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
If your model is no longer I/O bound but you want still more performance, you
can take things further by writing your own TensorFlow Ops, as described in
@{$adding_an_op$Adding a New Op}. Again we've provided an
example of this for the Skip-Gram case
-[models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
+[models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
Feel free to benchmark these against each other to measure performance
improvements at each stage.
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index 07c1919347..f084931215 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -357,14 +357,12 @@ if __name__ == '__main__':
'--window_size_ms',
type=float,
default=30.0,
- help='How long each spectrogram timeslice is.',
- )
+ help='How long each spectrogram timeslice is.',)
parser.add_argument(
'--window_stride_ms',
type=float,
default=10.0,
- help='How far to move in time between spectogram timeslices.',
- )
+ help='How far to move in time between spectogram timeslices.',)
parser.add_argument(
'--dct_coefficient_count',
type=int,
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 5a533e3b60..9dee1aa72b 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -104,9 +104,6 @@ cc_library(
copts = tf_copts(),
deps = [
":java_op_gen_lib",
- "//tensorflow/core:framework",
- "//tensorflow/core:framework_internal",
- "//tensorflow/core:lib",
],
)
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
deleted file mode 100644
index 13bc463e7d..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/Input.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow;
-
-/**
- * Interface implemented by operands of a TensorFlow operation.
- *
- * <p>Example usage:
- *
- * <pre>{@code
- * // The "decodeJpeg" operation can be used as input to the "cast" operation
- * Input decodeJpeg = ops.image().decodeJpeg(...);
- * ops.math().cast(decodeJpeg, DataType.FLOAT);
- *
- * // The output "y" of the "unique" operation can be used as input to the "cast" operation
- * Output y = ops.array().unique(...).y();
- * ops.math().cast(y, DataType.FLOAT);
- *
- * // The "split" operation can be used as input list to the "concat" operation
- * Iterable<? extends Input> split = ops.array().split(...);
- * ops.array().concat(0, split);
- * }</pre>
- */
-public interface Input<T> {
-
- /**
- * Returns the symbolic handle of a tensor.
- *
- * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
- * used to obtain a symbolic handle that represents the computation of the input.
- *
- * @see OperationBuilder#addInput(Output)
- */
- Output<T> asOutput();
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
deleted file mode 100644
index ab34f6aa12..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a boolean. */
-public class TFBool implements TFType {
- private TFBool() {}
- static {
- Types.typeCodes.put(TFBool.class, DataType.BOOL);
- }
- static {
- Types.scalars.put(TFBool.class, false);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
deleted file mode 100644
index 49e5d9f2f3..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit double precision floating point number. */
-public class TFDouble implements TFType {
- private TFDouble() {}
- static {
- Types.typeCodes.put(TFDouble.class, DataType.DOUBLE);
- }
- static {
- Types.scalars.put(TFDouble.class, 0.0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
deleted file mode 100644
index 8426ee41f0..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit single precision floating point number. */
-public class TFFloat implements TFType {
- private TFFloat() {}
- static {
- Types.typeCodes.put(TFFloat.class, DataType.FLOAT);
- }
- static {
- Types.scalars.put(TFFloat.class, 0f);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
deleted file mode 100644
index 3947b6ad09..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit signed integer. */
-public class TFInt32 implements TFType {
- private TFInt32() {}
- static {
- Types.typeCodes.put(TFInt32.class, DataType.INT32);
- }
- static {
- Types.scalars.put(TFInt32.class, 0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
deleted file mode 100644
index ccdded8693..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit signed integer. */
-public class TFInt64 implements TFType {
- private TFInt64() {}
- static {
- Types.typeCodes.put(TFInt64.class, DataType.INT64);
- }
- static {
- Types.scalars.put(TFInt64.class, 0L);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
deleted file mode 100644
index e7327e8c57..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an arbitrary sequence of bytes. */
-public class TFString implements TFType {
- private TFString() {}
- static {
- Types.typeCodes.put(TFString.class, DataType.STRING);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
deleted file mode 100644
index 562953ac9d..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-/**
- * A marker interface for classes representing TensorFlow types.
- */
-public interface TFType {}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
deleted file mode 100644
index d7305ca5a8..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an 8-bit unsigned integer. */
-public class TFUInt8 implements TFType {
- private TFUInt8() {}
- static {
- Types.typeCodes.put(TFUInt8.class, DataType.UINT8);
- }
- static {
- Types.scalars.put(TFUInt8.class, (byte)0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java b/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
deleted file mode 100644
index 976cd9fd34..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-import java.util.HashMap;
-import java.util.Map;
-import org.tensorflow.DataType;
-
-/**
- * Utility class for managing the representation of TensorFlow types as Java
- * types. For each TensorFlow type (e.g., int32), there is a corresponding Java
- * type (e.g., TFInt32) that represents it at compile time and a corresponding
- * class object (e.g., TFInt32.class) that represents it at run time. There is
- * also an enumeration value in DataType that can be used to represent the
- * type, though that should rarely be required.
- */
-public class Types {
-
- private Types() {} // not instantiable
-
- static final Map<Class<?>, DataType> typeCodes = new HashMap<>();
-
- /** Returns the DataType value corresponding to a TensorFlow type class. */
- public static DataType dataType(Class<? extends TFType> c) {
- DataType dtype = typeCodes.get(c);
- if (dtype == null) {
- throw new IllegalArgumentException("" + c + " is not a TensorFlow type.");
- }
- return dtype;
- }
-
- static final Map<Class<?>, Object> scalars = new HashMap<>();
-
- /** Returns the zero value of type described by {@code c}, or null if
- * the type (e.g., string) is not numeric and therefore has no zero value.
- */
- public static Object zeroValue(Class<? extends TFType> c) {
- return scalars.get(c);
- }
-}
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index aabf89a234..7389730d91 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1378,7 +1378,8 @@ class TensorFlowTestCase(googletest.TestCase):
"""
device1 = pydev.canonical_name(device1)
device2 = pydev.canonical_name(device2)
- self.assertEqual(device1, device2, "Devices %s and %s are not equal. %s" %
+ self.assertEqual(device1, device2,
+ "Devices %s and %s are not equal. %s" %
(device1, device2, msg))
# Fix Python 3 compatibility issues
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index 74548d05c8..11a5e0aeaa 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -48,8 +48,8 @@ class LSTMLayerTest(test.TestCase):
units = 2
model = keras.models.Sequential()
- inputs = keras.layers.Dense(
- embedding_dim, input_shape=(timesteps, embedding_dim))
+ inputs = keras.layers.Dense(embedding_dim,
+ input_shape=(timesteps, embedding_dim))
model.add(inputs)
layer = keras.layers.LSTM(units, return_sequences=True)
model.add(layer)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 343d158498..8cb9f9e621 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -129,7 +129,7 @@ class LinearOperatorDiagTest(
with self.test_session() as sess:
x = random_ops.random_normal(shape=(2, 2, 3, 4))
- # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
+ # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
# and matmul with 'x' as the argument.
diag = random_ops.random_uniform(shape=(2, 1, 3))
operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 5a54f448d0..bbce6b7d47 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,7 +46,8 @@ class SegmentReductionHelper(test.TestCase):
return constant_op.constant(
np_values, shape=input_shape, dtype=dtype), np_values
- def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None):
+ def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None,
+ initial_value=0):
if not x.size:
return np.array([])
indices = np.asarray(indices)
@@ -64,13 +65,8 @@ class SegmentReductionHelper(test.TestCase):
else:
output[index] = x_flat[i]
# zero initialize values that are still uncalculated.
- # output = [o if o is not None else np.zeros(slice_shape) for o in output]
- if not op1 == np.max:
- output = [o if o is not None else np.zeros(slice_shape) for o in output]
- else:
- zeroslice = np.zeros(slice_shape)
- zeroslice.fill(dtype.min)
- output = [o if o is not None else zeroslice for o in output]
+ initial_value_slice = np.ones(slice_shape) * initial_value
+ output = [o if o is not None else initial_value_slice for o in output]
if op2 is not None:
output = [op2(o) for o in output]
output = [o.reshape(slice_shape) for o in output]
@@ -82,6 +78,9 @@ class SegmentReductionHelper(test.TestCase):
def _mean_reduce_op(self, x):
return x[0] / x[1] if isinstance(x, tuple) else x
+ def _sqrt_n_reduce_op(self, x):
+ return x[0] / np.sqrt(x[1]) if isinstance(x, tuple) else x
+
class SegmentReductionOpTest(SegmentReductionHelper):
@@ -244,27 +243,61 @@ class SegmentReductionOpTest(SegmentReductionHelper):
self.assertAllClose(jacob_t, jacob_n)
-class UnsortedSegmentSumTest(SegmentReductionHelper):
+class UnsortedSegmentTest(SegmentReductionHelper):
+
+ def __init__(self, methodName='runTest'):
+ # Each item is np_op1, np_op2, tf_op, initial_value functor
+ self.ops_list = [(np.add, None,
+ math_ops.unsorted_segment_sum, lambda t: 0),
+ (self._mean_cum_op, self._mean_reduce_op,
+ math_ops.unsorted_segment_mean, lambda t: 0),
+ (self._mean_cum_op, self._sqrt_n_reduce_op,
+ math_ops.unsorted_segment_sqrt_n, lambda t: 0),
+ (np.ndarray.__mul__, None,
+ math_ops.unsorted_segment_prod, lambda t: 1),
+ (np.minimum, None,
+ math_ops.unsorted_segment_min, lambda t: t.max),
+ (np.maximum, None,
+ math_ops.unsorted_segment_max, lambda t: t.min)]
+
+ # A subset of ops has been enabled for complex numbers
+ self.complex_ops_list = [(np.add, None,
+ math_ops.unsorted_segment_sum, lambda t: 0)]
+ self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
+ dtypes_lib.float64]
+ self.all_dtypes = (self.differentiable_dtypes +
+ [dtypes_lib.bfloat16,
+ dtypes_lib.int64, dtypes_lib.int32,
+ dtypes_lib.complex64, dtypes_lib.complex128])
+ super(UnsortedSegmentTest, self).__init__(methodName=methodName)
def testValues(self):
- dtypes = [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
- dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
- ]
indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
num_segments = 12
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (2,)
- for dtype in dtypes:
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtype)
- np_ans = self._segmentReduce(
- indices, np_x, np.add, op2=None, num_segments=num_segments)
- s = math_ops.unsorted_segment_sum(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
- tf_ans = s.eval()
- self.assertAllClose(np_ans, tf_ans)
- self.assertShapeEqual(np_ans, s)
+ for dtype in self.all_dtypes:
+ ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
+ tf_x, np_x = self._input(shape, dtype=dtype)
+ for use_gpu in [True, False]:
+ with self.test_session(use_gpu=True):
+ for np_op1, np_op2, tf_op, init_op in ops_list:
+ # sqrt_n doesn't support integers
+ if (np_op2 == self._sqrt_n_reduce_op and dtype.is_integer):
+ continue
+ # TODO(philjd): enable this test once real_div supports bfloat16
+ if (np_op2 in [self._sqrt_n_reduce_op, self._mean_reduce_op] and
+ dtype == dtypes_lib.bfloat16):
+ continue
+ np_ans = self._segmentReduce(
+ indices, np_x, np_op1, np_op2, num_segments=num_segments,
+ initial_value=init_op(dtype))
+ s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
+ tf_ans = s.eval()
+ if dtype is dtypes_lib.bfloat16:
+ tf_ans = tf_ans.astype(np.float32)
+ self.assertAllClose(np_ans, tf_ans)
+ self.assertShapeEqual(np_ans, s)
def testNumSegmentsTypes(self):
dtypes = [dtypes_lib.int32, dtypes_lib.int64]
@@ -287,25 +320,51 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
self.assertAllClose(np_ans, tf_ans)
self.assertShapeEqual(np_ans, s)
- def testGradientSegmentSum(self):
+ def testGradients(self):
num_cols = 2
- indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+ indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
num_segments = max(indices_flat) + 3
- for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
- dtypes_lib.complex128]:
+ for dtype in self.differentiable_dtypes:
+ ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (num_cols,)
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtype)
- s = math_ops.unsorted_segment_sum(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
+ # test CPU and GPU as tf.gather behaves differently on each device
+ for use_gpu in [False, True]:
+ with self.test_session(use_gpu=use_gpu):
+ for _, _, tf_op, _ in ops_list:
+ tf_x, np_x = self._input(shape, dtype=dtype)
+ s = tf_op(tf_x, indices, num_segments)
+ jacob_t, jacob_n = gradient_checker.compute_gradient(
+ tf_x,
+ shape,
+ s, [num_segments, num_cols],
+ x_init_value=np_x,
+ delta=1)
+ self.assertAllClose(jacob_t, jacob_n)
+
+ def testProdGrad(self):
+ # additional test for the prod gradient to ensure correct handling of zeros
+ values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
+ indices = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=np.int32)
+ indices_neg = np.array([-1, 0, 0, -1, 1, 1, -1, 2, 2], dtype=np.int32)
+ values_tf = constant_op.constant(values)
+ # ground truth partial derivatives
+ gradients_indices = np.zeros((9, 3), dtype=np.float32)
+ gradients_indices_neg = np.zeros((9, 3), dtype=np.float32)
+ # the derivative w.r.t. the other segments is zero, so here we only
+ # explicitly set the grad values for the corresponding segment
+ gradients_indices[range(9), indices] = [0, 0, 0, 4, 0, 0, 9, 9, 9]
+ gradients_indices_neg[range(9), indices_neg] = [0, 1, 0, 0, 2, 2, 0, 3, 3]
+ for use_gpu in [False, True]:
+ with self.test_session(use_gpu=use_gpu):
+ for ind, grad_gt in [(indices, gradients_indices),
+ (indices_neg, gradients_indices_neg)]:
+ s = math_ops.unsorted_segment_prod(values_tf,
+ constant_op.constant(ind), 3)
jacob_t, jacob_n = gradient_checker.compute_gradient(
- tf_x,
- shape,
- s, [num_segments, num_cols],
- x_init_value=np_x,
- delta=1)
- self.assertAllClose(jacob_t, jacob_n)
+ values_tf, (9,), s, (3,), x_init_value=values, delta=1)
+ self.assertAllClose(jacob_t, jacob_n)
+ self.assertAllClose(jacob_t, grad_gt)
def testGradientMatchesSegmentSum(self):
# Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
@@ -318,8 +377,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
num_cols = 2
shape = [n, num_cols]
num_segments = max(indices) + 1
- for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
- dtypes_lib.complex128]:
+ for dtype in self.differentiable_dtypes:
with self.test_session(use_gpu=True):
tf_x, np_x = self._input(shape, dtype=dtype)
# Results from UnsortedSegmentSum
@@ -353,9 +411,8 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
unsorted.eval()
def testEmptySecondDimension(self):
- dtypes = [
- np.float32, np.float64, np.int64, np.int32, np.complex64, np.complex128
- ]
+ dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
+ np.complex64, np.complex128]
with self.test_session(use_gpu=True):
for dtype in dtypes:
for itype in (np.int32, np.int64):
@@ -364,36 +421,14 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
unsorted = math_ops.unsorted_segment_sum(data, segment_ids, 2)
self.assertAllEqual(unsorted.eval(), np.zeros((2, 0), dtype=dtype))
- def testGradientSegmentMax(self):
- num_cols = 2
- indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
- num_segments = max(indices_flat) + 3
- for indices in indices_flat, indices_flat.reshape(5, 2):
- shape = indices.shape + (num_cols,)
- with self.test_session(use_gpu=True):
- tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
- s = math_ops.unsorted_segment_max(
- data=tf_x, segment_ids=indices, num_segments=num_segments)
- jacob_t, jacob_n = gradient_checker.compute_gradient(
- tf_x,
- shape,
- s,
- [num_segments, num_cols],
- x_init_value=np_x.astype(np.double), delta=1)
- self.assertAllClose(jacob_t, jacob_n)
-
def testDropNegatives(self):
# Note: the test is done by replacing segment_ids with 8 to -1
# for index and replace values generated by numpy with 0.
- dtypes = [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
- dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
- ]
indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
num_segments = 12
for indices in indices_flat, indices_flat.reshape(5, 2):
shape = indices.shape + (2,)
- for dtype in dtypes:
+ for dtype in self.all_dtypes:
with self.test_session(use_gpu=True):
tf_x, np_x = self._input(shape, dtype=dtype)
np_ans = self._segmentReduce(
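The refactored helper above drives all six unsorted segment ops through a single NumPy reference, with a per-op initial value (0 for sum/mean/sqrt_n, 1 for prod, and the dtype extremes for min/max) filling segments that receive no entries. A rough stand-alone sketch of that reference computation, with illustrative names only and not taken from the patch:

    import numpy as np

    def reference_unsorted_segment(x, segment_ids, num_segments, op, initial_value):
        # Every output slice starts at the op's identity element; rows with a
        # matching (non-negative) id are folded in one by one.
        out = np.full((num_segments,) + x.shape[1:], initial_value, dtype=x.dtype)
        for row, sid in zip(x, segment_ids):
            if sid >= 0:  # negative ids are dropped, as in the TF ops
                out[sid] = op(out[sid], row)
        return out

    x = np.array([[1, 2], [5, 0], [3, 9]], dtype=np.int32)
    print(reference_unsorted_segment(x, [0, 1, 0], 3, np.maximum,
                                     np.iinfo(np.int32).min))
    # segment 2 is empty, so it keeps the max-op fill value dtype.min:
    # [[3, 9], [5, 0], [-2147483648, -2147483648]]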
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 6366d2e181..4498fd9fe9 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -133,6 +133,39 @@ class UniqueWithCountsTest(test.TestCase):
v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)]
self.assertEqual(count, sum(v))
+ def testInt32Axis(self):
+ for dtype in [np.int32, np.int64]:
+ x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+ with self.test_session() as sess:
+ y0, idx0, count0 = gen_array_ops._unique_with_counts_v2(
+ x, axis=np.array([0], dtype))
+ tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0])
+ y1, idx1, count1 = gen_array_ops._unique_with_counts_v2(
+ x, axis=np.array([1], dtype))
+ tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1])
+ self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+ self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+ self.assertAllEqual(tf_count0, np.array([2, 1]))
+ self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+ self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+ self.assertAllEqual(tf_count1, np.array([1, 2]))
+
+ def testInt32V2(self):
+ # This test is only temporary; once V2 is used
+ # by default, the axis will be wrapped to allow `axis=None`.
+ x = np.random.randint(2, high=10, size=7000)
+ with self.test_session() as sess:
+ y, idx, count = gen_array_ops._unique_with_counts_v2(
+ x, axis=np.array([], np.int32))
+ tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+
+ self.assertEqual(len(x), len(tf_idx))
+ self.assertEqual(len(tf_y), len(np.unique(x)))
+ for i in range(len(x)):
+ self.assertEqual(x[i], tf_y[tf_idx[i]])
+ for value, count in zip(tf_y, tf_count):
+ self.assertEqual(count, np.sum(x == value))
+
if __name__ == '__main__':
test.main()
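The new `testInt32Axis` case exercises unique-with-counts along a dimension: uniqueness is decided per row (`axis=[0]`) or per column (`axis=[1]`) rather than per scalar. The same expectations can be reproduced in NumPy as a sanity check (assuming a NumPy build where `np.unique` accepts `axis`, i.e. 1.13+); this is only an illustration, not part of the change:

    import numpy as np

    x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
    # Unique rows: rows 0 and 1 are identical, so two unique rows remain.
    y0, idx0, count0 = np.unique(x, axis=0, return_inverse=True,
                                 return_counts=True)
    print(y0)      # [[1 0 0] [2 0 0]]
    print(idx0)    # [0 0 1]
    print(count0)  # [2 1]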
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index cdfb955f54..96f5f81c1f 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1325,6 +1325,18 @@ def unique(x, out_idx=dtypes.int32, name=None):
unique.__doc__ = gen_array_ops._unique.__doc__
+@tf_export("unique_with_counts")
+def unique_with_counts(x, out_idx=dtypes.int32, name=None):
+ # TODO(yongtang): switch to v2 once API deprecation
+ # period (3 weeks) pass.
+ # TODO(yongtang): The documentation should also
+ # be updated when switch to v2.
+ return gen_array_ops._unique_with_counts(x, out_idx, name)
+
+
+unique_with_counts.__doc__ = gen_array_ops._unique_with_counts.__doc__
+
+
@tf_export("split")
def split(value, num_or_size_splits, axis=0, num=None, name="split"):
"""Splits a tensor into sub tensors.
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index 0a2af3716b..c4cfc0da19 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -70,10 +70,8 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
self.assertAllEqual(truth, popcnt_result)
def testInvertOp(self):
- dtype_list = [
- dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8,
- dtypes.uint16, dtypes.uint32, dtypes.uint64
- ]
+ dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+ dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
inputs = [0, 5, 3, 14]
with self.test_session(use_gpu=True) as sess:
for dtype in dtype_list:
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 0fd6e29a49..64567ac54a 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -334,9 +334,9 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
@compatibility{eager} returns None
Raises:
- InvalidArgumentError if the check can be performed immediately and
- `x == y` is False. The check can be performed immediately during
- eager execution or if `x` and `y` are statically known.
+ InvalidArgumentError: if the check can be performed immediately and
+ `x == y` is False. The check can be performed immediately during eager
+ execution or if `x` and `y` are statically known.
"""
message = message or ''
with ops.name_scope(name, 'assert_equal', [x, y, data]):
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index e4ce2ab28a..b9a93c3bed 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -99,19 +99,16 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
name=None, weights=None):
"""Computes the confusion matrix from predictions and labels.
- Calculate the Confusion Matrix for a pair of prediction and
- label 1-D int arrays.
-
The matrix columns represent the prediction labels and the rows represent the
real labels. The confusion matrix is always a 2-D array of shape `[n, n]`,
where `n` is the number of valid labels for a given classification task. Both
prediction and labels must be 1-D arrays of the same shape in order for this
function to work.
- If `num_classes` is None, then `num_classes` will be set to the one plus
- the maximum value in either predictions or labels.
- Class labels are expected to start at 0. E.g., if `num_classes` was
- three, then the possible labels would be `[0, 1, 2]`.
+ If `num_classes` is `None`, then `num_classes` will be set to one plus the
+ maximum value in either predictions or labels. Class labels are expected to
+ start at 0. For example, if `num_classes` is 3, then the possible labels
+ would be `[0, 1, 2]`.
If `weights` is not `None`, then each prediction contributes its
corresponding weight to the total value of the confusion matrix cell.
@@ -141,8 +138,9 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
weights: An optional `Tensor` whose shape matches `predictions`.
Returns:
- A k X k matrix representing the confusion matrix, where k is the number of
- possible labels in the classification task.
+ A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
+ matrix, where `n` is the number of possible labels in the classification
+ task.
Raises:
ValueError: If both predictions and labels are not 1-D vectors and have
@@ -188,7 +186,7 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
weights = math_ops.cast(weights, dtype)
shape = array_ops.stack([num_classes, num_classes])
- indices = array_ops.transpose(array_ops.stack([labels, predictions]))
+ indices = array_ops.stack([labels, predictions], axis=1)
values = (array_ops.ones_like(predictions, dtype)
if weights is None else weights)
cm_sparse = sparse_tensor.SparseTensor(
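The functional edit here swaps `transpose(stack([labels, predictions]))` for `stack([labels, predictions], axis=1)` when building the per-example `(label, prediction)` index pairs for the sparse confusion matrix; both yield the same `[N, 2]` tensor, the latter without the extra transpose. A small NumPy illustration (not taken from the patch):

    import numpy as np

    labels = np.array([0, 2, 1])
    predictions = np.array([0, 1, 1])
    old = np.transpose(np.stack([labels, predictions]))  # [2, N] -> [N, 2]
    new = np.stack([labels, predictions], axis=1)         # directly [N, 2]
    assert (old == new).all()
    print(new)  # [[0 0] [2 1] [1 1]] -- one (row, column) index per example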
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index c78a5aa8c2..8218e60b53 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -179,8 +179,6 @@ def Assert(condition, data, summarize=None, name=None):
condition, data, summarize, name="Assert")
guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
- if context.in_eager_mode():
- return
return guarded_assert.op
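With the eager-mode early return removed, `Assert` now consistently hands back the guarded assert op. In graph mode that op is typically sequenced with a control dependency; a minimal TF 1.x-style sketch of that common pattern (not taken from this patch):

    import tensorflow as tf

    x = tf.placeholder(tf.float32, shape=[None])
    assert_op = tf.Assert(tf.reduce_all(x >= 0), [x])
    with tf.control_dependencies([assert_op]):
        # y only runs after the assertion has been checked.
        y = tf.sqrt(x)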
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index bed4cbb2c1..1d605c5dfc 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -213,7 +213,7 @@ def _ndtri(p):
# Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
# where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
- # arrays based on wether p < exp(-32).
+ # arrays based on whether p < exp(-32).
z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
first_term = z - math_ops.log(z) / z
second_term_small_p = (_create_polynomial(1. / z, p2)
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index f6ef6f3f3d..9b8172bf26 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -32,6 +32,8 @@ TileGrad # Exported through array_grad instead of array_ops.
ZerosLike # TODO(josh11b): Use this instead of the Python version.
Unique
UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
Unpack
# candidate_sampling_ops
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 53bd108c44..58c18c6696 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -452,7 +452,6 @@ def _rot90_4D(images, k, name_scope):
def _rot180():
return array_ops.reverse_v2(images, [1, 2])
-
def _rot270():
return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]), [2])
@@ -465,7 +464,6 @@ def _rot90_4D(images, k, name_scope):
result.set_shape([shape[0], None, None, shape[3]])
return result
-
@tf_export('image.transpose_image')
def transpose_image(image):
"""Transpose image(s) by swapping the height and width dimension.
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index b67e7cc558..b8c4b27c16 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1170,6 +1170,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
op(p_wrong_rank)
+
def testRot90GroupOrder(self):
image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
with self.test_session(use_gpu=True):
@@ -1204,7 +1205,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
y_np = np.rot90(image, k=k, axes=(1, 2))
self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
-
class RandomFlipTest(test_util.TensorFlowTestCase):
def testRandomLeftRight(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index b3ec3d5b7c..e180e83026 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -67,7 +67,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
operator = LinearOperatorDiag(diag)
# Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible
- # since the batch dimensions, [2, 1], are brodcast to
+ # since the batch dimensions, [2, 1], are broadcast to
# operator.batch_shape = [2, 3].
y = tf.random_normal(shape=[2, 1, 4, 2])
x = operator.solve(y)
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index a39417139e..7386976e93 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -654,7 +654,7 @@ def sigmoid_cross_entropy(
Args:
multi_class_labels: `[batch_size, num_classes]` target integer labels in
- `(0, 1)`.
+ `{0, 1}`.
logits: Float `[batch_size, num_classes]` logits outputs of the network.
weights: Optional `Tensor` whose rank is either 0, or the same rank as
`labels`, and must be broadcastable to `labels` (i.e., all dimensions must
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index bf28f74153..69afa618e2 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -234,56 +234,142 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
dim0), None, None, None)
-def _SegmentMinOrMaxGrad(op, grad, is_sorted):
- """Gradient for SegmentMin and (unsorted) SegmentMax.
-
- They share similar code.
- """
- zeros = array_ops.zeros(
- array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype)
-
+def _SegmentMinOrMaxGrad(op, grad):
+ """ Gradient for SegmentMin and SegmentMax. """
+ zeros = array_ops.zeros_like(op.inputs[0], dtype=op.inputs[0].dtype)
# Get the number of selected (minimum or maximum) elements in each segment.
gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
- if is_sorted:
- num_selected = math_ops.segment_sum(
- math_ops.cast(is_selected, grad.dtype), op.inputs[1])
- else:
- num_selected = math_ops.unsorted_segment_sum(
- math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
-
+ num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
+ op.inputs[1])
# Compute the gradient for each segment. The gradient for the ith segment is
# divided evenly among the selected elements in that segment.
weighted_grads = math_ops.div(grad, num_selected)
gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
-
- if is_sorted:
- return array_ops.where(is_selected, gathered_grads, zeros), None
- else:
- return array_ops.where(is_selected, gathered_grads, zeros), None, None
+ return array_ops.where(is_selected, gathered_grads, zeros), None
@ops.RegisterGradient("SegmentMin")
def _SegmentMinGrad(op, grad):
"""Gradient for SegmentMin."""
- return _SegmentMinOrMaxGrad(op, grad, True)
+ return _SegmentMinOrMaxGrad(op, grad)
@ops.RegisterGradient("SegmentMax")
def _SegmentMaxGrad(op, grad):
"""Gradient for SegmentMax."""
- return _SegmentMinOrMaxGrad(op, grad, True)
+ return _SegmentMinOrMaxGrad(op, grad)
+
+
+def _GatherDropNegatives(params, ids, zero_clipped_indices=None,
+ is_positive=None):
+ """ Helper function for unsorted segment ops. Gathers params for
+ positive segment ids and gathers 0 for inputs with negative segment id.
+ Also returns the clipped indices and a boolean mask with the same shape
+ as ids, where positive ids are marked as true. The latter two can be
+ passed back into this function to avoid recomputing them.
+ """
+ if zero_clipped_indices is None:
+ zero_clipped_indices = math_ops.maximum(ids, array_ops.zeros_like(ids))
+ gathered = array_ops.gather(params, zero_clipped_indices)
+ if is_positive is None:
+ is_positive = math_ops.greater_equal(ids, 0)
+ # tf.where(condition, x, y) requires condition to have the same shape as x
+ # and y.
+ # TODO(philjd): remove this if tf.where supports broadcasting (#9284)
+ for _ in range(gathered.shape.ndims - is_positive.shape.ndims):
+ is_positive = array_ops.expand_dims(is_positive, -1)
+ is_positive = (is_positive &
+ array_ops.ones_like(gathered, dtype=dtypes.bool))
+ # replace gathered params of negative indices with 0
+ zero_slice = array_ops.zeros_like(gathered)
+ return (array_ops.where(is_positive, gathered, zero_slice),
+ zero_clipped_indices, is_positive)
+
+
+def _UnsortedSegmentMinOrMaxGrad(op, grad):
+ """ Gradient for UnsortedSegmentMin and UnsortedSegmentMax. """
+ # Get the number of selected (minimum or maximum) elements in each segment.
+ gathered_outputs, zero_clipped_indices, is_positive = \
+ _GatherDropNegatives(op.outputs[0], op.inputs[1])
+ is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
+ is_selected = math_ops.logical_and(is_selected, is_positive)
+ num_selected = math_ops.unsorted_segment_sum(
+ math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
+ # Compute the gradient for each segment. The gradient for the ith segment is
+ # divided evenly among the selected elements in that segment.
+ weighted_grads = math_ops.div(grad, num_selected)
+ gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
+ zero_clipped_indices,
+ is_positive)
+ zeros = array_ops.zeros_like(gathered_grads)
+ return array_ops.where(is_selected, gathered_grads, zeros), None, None
@ops.RegisterGradient("UnsortedSegmentSum")
def _UnsortedSegmentSumGrad(op, grad):
- """Gradient for SegmentSum."""
- return array_ops.gather(grad, op.inputs[1]), None, None
+ """Gradient for UnsortedSegmentSum."""
+ return _GatherDropNegatives(grad, op.inputs[1])[0], None, None
@ops.RegisterGradient("UnsortedSegmentMax")
def _UnsortedSegmentMaxGrad(op, grad):
- return _SegmentMinOrMaxGrad(op, grad, False)
+ """ Gradient for UnsortedSegmentMax. """
+ return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentMin")
+def _UnsortedSegmentMinGrad(op, grad):
+ """ Gradient for UnsortedSegmentMin. """
+ return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentProd")
+def _UnsortedSegmentProdGrad(op, grad):
+ """ Gradient for UnsortedSegmentProd.
+ The gradient can be expressed for each segment by dividing the segment's
+ product by each element of the segment input tensor, but this approach can't
+ deal with zeros in the input.
+ Unlike reduce_prod we can't use cumsum here as individual segments may have
+ a different number of elements. Therefore we consider three cases:
+ 1) A segment input contains no zeros and we can safely divide by the input
+ tensor.
+ 2) A segment contains exactly one zero. Then the gradient of each input of
+ the segment is zero except for the 0-input, where the gradient is
+ the product of the remaining segment entries.
+ 3) A segment contains at least two zeros. The gradient is zero for all
+ segment inputs.
+ """
+ # Note that unsorted_segment_sum will filter out the negative indices,
+ # so we don't need to do a logical_and with is_positive here
+ is_zero = math_ops.equal(op.inputs[0], 0)
+ num_zeros = gen_math_ops.unsorted_segment_sum(
+ math_ops.cast(is_zero, dtype=dtypes.int32), op.inputs[1], op.inputs[2])
+ # handle case 3 and set the gradient to 0 for segments with more than one
+ # 0 as input
+ grad = array_ops.where(math_ops.greater(num_zeros, 1),
+ array_ops.zeros_like(grad), grad)
+ # replace all zeros with ones and compute the unsorted_segment_prod
+ non_zero_data = array_ops.where(is_zero, array_ops.ones_like(op.inputs[0]),
+ op.inputs[0])
+ non_zero_prod = gen_math_ops.unsorted_segment_prod(
+ non_zero_data, op.inputs[1], op.inputs[2])
+ # clip the indices for gather to be positive
+ zero_clipped_indices = math_ops.maximum(op.inputs[1],
+ array_ops.zeros_like(op.inputs[1]))
+ gathered_prod = array_ops.gather(op.outputs[0], zero_clipped_indices)
+ gathered_non_zero_prod = array_ops.gather(non_zero_prod,
+ zero_clipped_indices)
+ prod_divided_by_el = gathered_prod / op.inputs[0] # May contain nan/inf.
+ # Now fetch the individual results for segments containing 0 and those that
+ # don't. is_zero will also fetch results for entries with negative index,
+ # but the following gather_drop_negatives sets the corresponding entry in
+ # grad to 0 for these.
+ partial_derivative = array_ops.where(is_zero, gathered_non_zero_prod,
+ prod_divided_by_el)
+ gathered_grad = _GatherDropNegatives(grad, op.inputs[1],
+ zero_clipped_indices)[0]
+ return gathered_grad * partial_derivative, None, None
@ops.RegisterGradient("Abs")
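`_UnsortedSegmentProdGrad` above implements the three zero-handling cases from its docstring. A rough NumPy rendering of the per-element partial derivatives it computes, reusing the values from `testProdGrad` (names here are illustrative, not part of the patch):

    import numpy as np

    def segment_prod_partials(values, segment_ids, num_segments):
        # d(segment_prod)/d(value) per element: two or more zeros in a segment
        # -> all partials 0; exactly one zero -> the product of the remaining
        # entries at the zero's position; no zeros -> prod / value.
        partials = np.zeros_like(values)
        for s in range(num_segments):
            idx = np.where(segment_ids == s)[0]
            seg = values[idx]
            zero_pos = idx[seg == 0]
            if len(zero_pos) >= 2:
                continue
            elif len(zero_pos) == 1:
                partials[zero_pos[0]] = np.prod(seg[seg != 0])
            else:
                partials[idx] = np.prod(seg) / seg
        return partials

    values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
    ids = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    print(segment_prod_partials(values, ids, 3))
    # -> [0. 0. 0. 4. 0. 0. 9. 9. 9.], matching the ground truth in testProdGrad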
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index a09540028f..2ae8b610da 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -131,6 +131,9 @@ See the @{$python/math_ops} guide.
@@segment_mean
@@unsorted_segment_sum
@@unsorted_segment_max
+@@unsorted_segment_min
+@@unsorted_segment_prod
+@@unsorted_segment_sqrt_n
@@sparse_segment_sum
@@sparse_segment_mean
@@sparse_segment_sqrt_n
@@ -898,6 +901,40 @@ def to_bfloat16(x, name="ToBFloat16"):
return cast(x, dtypes.bfloat16, name=name)
+@tf_export("to_complex64")
+def to_complex64(x, name="ToComplex64"):
+ """Casts a tensor to type `complex64`.
+
+ Args:
+ x: A `Tensor` or `SparseTensor`.
+ name: A name for the operation (optional).
+
+ Returns:
+ A `Tensor` or `SparseTensor` with same shape as `x` with type `complex64`.
+
+ Raises:
+ TypeError: If `x` cannot be cast to the `complex64`.
+ """
+ return cast(x, dtypes.complex64, name=name)
+
+
+@tf_export("to_complex128")
+def to_complex128(x, name="ToComplex128"):
+ """Casts a tensor to type `complex128`.
+
+ Args:
+ x: A `Tensor` or `SparseTensor`.
+ name: A name for the operation (optional).
+
+ Returns:
+ A `Tensor` or `SparseTensor` with same shape as `x` with type `complex128`.
+
+ Raises:
+ TypeError: If `x` cannot be cast to the `complex128`.
+ """
+ return cast(x, dtypes.complex128, name=name)
+
+
ops.Tensor._override_operator("__neg__", gen_math_ops._neg)
ops.Tensor._override_operator("__abs__", abs)
# __invert__ corresponds to the ~ operator. Here we follow the numpy convention
@@ -2552,6 +2589,87 @@ def reduced_shape(input_shape, axes):
]) # [1, 1]
+def _unsorted_segment_N(data, segment_ids, num_segments):
+ """ Helper function for unsorted_segment_mean/_sqrtN. Computes the number
+ of segment entries with 0-entries set to 1 to allow division by N.
+ """
+ # bincount doesn't support negative indices so we use unsorted_segment_sum
+ ones_tensor = array_ops.ones(segment_ids.shape, dtype=data.dtype)
+ N = gen_math_ops.unsorted_segment_sum(ones_tensor, segment_ids, num_segments)
+ # add dimensions for all non-reduced axes
+ ndims_output = data.shape.ndims - segment_ids.shape.ndims
+ broadcast_shape = [num_segments] + [1] * ndims_output
+ N = array_ops.reshape(N, broadcast_shape)
+ return gen_math_ops.maximum(N, 1)
+
+
+@tf_export("unsorted_segment_mean")
+def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
+ r""" Computes the mean along segments of a tensor.
+
+ Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+ of segments.
+
+ This operator is similar to the unsorted segment sum operator found
+ [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+ Instead of computing the sum over segments, it computes the mean of all
+ entries belonging to a segment such that:
+
+ \\(output_i = 1/N_i \sum_j data_j\\) where the sum is over `j` such
+ that `segment_ids[j] == i`, with \\(N_i\\) being the number of occurrences
+ of id \\(i\\).
+
+ If there is no entry for a given segment ID `i`, it outputs 0.
+
+ segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
+
+ output: Has same shape as data, except for dimension 0 which
+ has size `num_segments`.
+ """
+ with ops.name_scope(name, "UnsortedSegmentMean"):
+ data = ops.convert_to_tensor(data)
+ segment_ids = ops.convert_to_tensor(segment_ids)
+ N = _unsorted_segment_N(data, segment_ids, num_segments)
+ summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+ return summed / N
+
+
+@tf_export("unsorted_segment_sqrt_n")
+def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
+ r"""Computes the sum along segments of a tensor divided by the sqrt(N).
+
+ Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+ of segments.
+
+ This operator is similar to the unsorted segment sum operator found
+ [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+ In addition to computing the sum over segments, it divides the results by
+ sqrt(N).
+
+ \\(output_i = 1/\sqrt{N_i} \sum_j data_j\\) where the sum is over `j` such
+ that `segment_ids[j] == i`, with \\(N_i\\) being the number of occurrences
+ of id \\(i\\).
+
+ If there is no entry for a given segment ID `i`, it outputs 0.
+
+ Note that this op only supports floating point and complex dtypes,
+ due to tf.sqrt only supporting these types.
+
+ segment_ids: An integer tensor whose shape is a prefix of `data.shape`.
+
+ output: Has same shape as data, except for dimension 0 which
+ has size `num_segments`.
+ """
+ with ops.name_scope(name, "UnsortedSegmentSqrtN"):
+ data = ops.convert_to_tensor(data)
+ segment_ids = ops.convert_to_tensor(segment_ids)
+ N = _unsorted_segment_N(data, segment_ids, num_segments)
+ summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+ return summed / gen_math_ops.sqrt(N)
+
+
@tf_export("sparse_segment_sum")
def sparse_segment_sum(data, indices, segment_ids, name=None,
num_segments=None):
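To make the new mean/sqrt_n composition concrete, here is a small usage sketch in TF 1.x session style, assuming a build where these exports are available (the expected values follow directly from the formulas above; an empty segment yields 0 because its zero sum is divided by the clamped N of 1):

    import tensorflow as tf

    data = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
    ids = tf.constant([0, 0, 1])
    mean = tf.unsorted_segment_mean(data, ids, num_segments=3)
    sqrt_n = tf.unsorted_segment_sqrt_n(data, ids, num_segments=3)
    with tf.Session() as sess:
        print(sess.run(mean))    # [[2. 3.] [5. 6.] [0. 0.]]
        print(sess.run(sqrt_n))  # [[2.83 4.24] [5. 6.] [0. 0.]], i.e. 4/sqrt(2), 6/sqrt(2)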
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 2333736583..f8d08f1d39 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -2061,6 +2061,18 @@ tf_module {
argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
+ name: "unsorted_segment_min"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "unsorted_segment_prod"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "unsorted_segment_sqrt_n"
+ argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "unsorted_segment_sum"
argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index 5817716c8d..d4bf546d40 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -36,8 +36,13 @@ else
rm /this_is_writable_file_system
fi
+if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
+ ADDUSER_OPTS="--force-badname"
+fi
+
getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
-getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
+getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
+ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
--gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
--disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
usermod -a -G sudo "${CI_BUILD_USER}"
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
deleted file mode 100644
index 23cc4a21a9..0000000000
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ /dev/null
@@ -1,502 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Upgrader for Python scripts according to an API change specification."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ast
-import collections
-import os
-import shutil
-import sys
-import tempfile
-import traceback
-
-
-class APIChangeSpec(object):
- """This class defines the transformations that need to happen.
-
- This class must provide the following fields:
-
- * `function_keyword_renames`: maps function names to a map of old -> new
- argument names
- * `function_renames`: maps function names to new function names
- * `change_to_function`: a set of function names that have changed (for
- notifications)
- * `function_reorders`: maps functions whose argument order has changed to the
- list of arguments in the new order
- * `function_handle`: maps function names to custom handlers for the function
-
- For an example, see `TFAPIChangeSpec`.
- """
-
-
-class _FileEditTuple(
- collections.namedtuple("_FileEditTuple",
- ["comment", "line", "start", "old", "new"])):
- """Each edit that is recorded by a _FileEditRecorder.
-
- Fields:
- comment: A description of the edit and why it was made.
- line: The line number in the file where the edit occurs (1-indexed).
- start: The line number in the file where the edit occurs (0-indexed).
- old: text string to remove (this must match what was in file).
- new: text string to add in place of `old`.
- """
-
- __slots__ = ()
-
-
-class _FileEditRecorder(object):
- """Record changes that need to be done to the file."""
-
- def __init__(self, filename):
- # all edits are lists of chars
- self._filename = filename
-
- self._line_to_edit = collections.defaultdict(list)
- self._errors = []
-
- def process(self, text):
- """Process a list of strings, each corresponding to the recorded changes.
-
- Args:
- text: A list of lines of text (assumed to contain newlines)
- Returns:
- A tuple of the modified text and a textual description of what is done.
- Raises:
- ValueError: if substitution source location does not have expected text.
- """
-
- change_report = ""
-
- # Iterate of each line
- for line, edits in self._line_to_edit.items():
- offset = 0
- # sort by column so that edits are processed in order in order to make
- # indexing adjustments cumulative for changes that change the string
- # length
- edits.sort(key=lambda x: x.start)
-
- # Extract each line to a list of characters, because mutable lists
- # are editable, unlike immutable strings.
- char_array = list(text[line - 1])
-
- # Record a description of the change
- change_report += "%r Line %d\n" % (self._filename, line)
- change_report += "-" * 80 + "\n\n"
- for e in edits:
- change_report += "%s\n" % e.comment
- change_report += "\n Old: %s" % (text[line - 1])
-
- # Make underscore buffers for underlining where in the line the edit was
- change_list = [" "] * len(text[line - 1])
- change_list_new = [" "] * len(text[line - 1])
-
- # Iterate for each edit
- for e in edits:
- # Create effective start, end by accounting for change in length due
- # to previous edits
- start_eff = e.start + offset
- end_eff = start_eff + len(e.old)
-
- # Make sure the edit is changing what it should be changing
- old_actual = "".join(char_array[start_eff:end_eff])
- if old_actual != e.old:
- raise ValueError("Expected text %r but got %r" %
- ("".join(e.old), "".join(old_actual)))
- # Make the edit
- char_array[start_eff:end_eff] = list(e.new)
-
- # Create the underline highlighting of the before and after
- change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
- change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
- # Keep track of how to generate effective ranges
- offset += len(e.new) - len(e.old)
-
- # Finish the report comment
- change_report += " %s\n" % "".join(change_list)
- text[line - 1] = "".join(char_array)
- change_report += " New: %s" % (text[line - 1])
- change_report += " %s\n\n" % "".join(change_list_new)
- return "".join(text), change_report, self._errors
-
- def add(self, comment, line, start, old, new, error=None):
- """Add a new change that is needed.
-
- Args:
- comment: A description of what was changed
- line: Line number (1 indexed)
- start: Column offset (0 indexed)
- old: old text
- new: new text
- error: this "edit" is something that cannot be fixed automatically
- Returns:
- None
- """
-
- self._line_to_edit[line].append(
- _FileEditTuple(comment, line, start, old, new))
- if error:
- self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
- """AST Visitor that processes function calls.
-
- Updates function calls from old API version to new API version using a given
- change spec.
- """
-
- def __init__(self, filename, lines, api_change_spec):
- self._filename = filename
- self._file_edit = _FileEditRecorder(filename)
- self._lines = lines
- self._api_change_spec = api_change_spec
-
- def process(self, lines):
- return self._file_edit.process(lines)
-
- def generic_visit(self, node):
- ast.NodeVisitor.generic_visit(self, node)
-
- def _rename_functions(self, node, full_name):
- function_renames = self._api_change_spec.function_renames
- try:
- new_name = function_renames[full_name]
- self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
- node.lineno, node.col_offset, full_name, new_name)
- except KeyError:
- pass
-
- def _get_attribute_full_path(self, node):
- """Traverse an attribute to generate a full name e.g. tf.foo.bar.
-
- Args:
- node: A Node of type Attribute.
-
- Returns:
- a '.'-delimited full-name or None if the tree was not a simple form.
- i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
- """
- curr = node
- items = []
- while not isinstance(curr, ast.Name):
- if not isinstance(curr, ast.Attribute):
- return None
- items.append(curr.attr)
- curr = curr.value
- items.append(curr.id)
- return ".".join(reversed(items))
-
- def _find_true_position(self, node):
- """Return correct line number and column offset for a given node.
-
- This is necessary mainly because ListComp's location reporting reports
- the next token after the list comprehension list opening.
-
- Args:
- node: Node for which we wish to know the lineno and col_offset
- """
- import re
- find_open = re.compile("^\s*(\\[).*$")
- find_string_chars = re.compile("['\"]")
-
- if isinstance(node, ast.ListComp):
- # Strangely, ast.ListComp returns the col_offset of the first token
- # after the '[' token which appears to be a bug. Workaround by
- # explicitly finding the real start of the list comprehension.
- line = node.lineno
- col = node.col_offset
- # loop over lines
- while 1:
- # Reverse the text to and regular expression search for whitespace
- text = self._lines[line - 1]
- reversed_preceding_text = text[:col][::-1]
- # First find if a [ can be found with only whitespace between it and
- # col.
- m = find_open.match(reversed_preceding_text)
- if m:
- new_col_offset = col - m.start(1) - 1
- return line, new_col_offset
- else:
- if (reversed_preceding_text == "" or
- reversed_preceding_text.isspace()):
- line = line - 1
- prev_line = self._lines[line - 1]
- # TODO(aselle):
- # this is poor comment detection, but it is good enough for
- # cases where the comment does not contain string literal starting/
- # ending characters. If ast gave us start and end locations of the
- # ast nodes rather than just start, we could use string literal
- # node ranges to filter out spurious #'s that appear in string
- # literals.
- comment_start = prev_line.find("#")
- if comment_start == -1:
- col = len(prev_line) - 1
- elif find_string_chars.search(prev_line[comment_start:]) is None:
- col = comment_start
- else:
- return None, None
- else:
- return None, None
- # Most other nodes return proper locations (with notably does not), but
- # it is not possible to use that in an argument.
- return node.lineno, node.col_offset
-
- def visit_Call(self, node): # pylint: disable=invalid-name
- """Handle visiting a call node in the AST.
-
- Args:
- node: Current Node
- """
-
- # Find a simple attribute name path e.g. "tf.foo.bar"
- full_name = self._get_attribute_full_path(node.func)
-
- # Make sure the func is marked as being part of a call
- node.func.is_function_for_call = True
-
- if full_name:
- # Call special handlers
- function_handles = self._api_change_spec.function_handle
- if full_name in function_handles:
- function_handles[full_name](self._file_edit, node)
-
- # Examine any non-keyword argument and make it into a keyword argument
- # if reordering required.
- function_reorders = self._api_change_spec.function_reorders
- function_keyword_renames = (
- self._api_change_spec.function_keyword_renames)
-
- if full_name in function_reorders:
- reordered = function_reorders[full_name]
- for idx, arg in enumerate(node.args):
- lineno, col_offset = self._find_true_position(arg)
- if lineno is None or col_offset is None:
- self._file_edit.add(
- "Failed to add keyword %r to reordered function %r" %
- (reordered[idx], full_name),
- arg.lineno,
- arg.col_offset,
- "",
- "",
- error="A necessary keyword argument failed to be inserted.")
- else:
- keyword_arg = reordered[idx]
- if (full_name in function_keyword_renames and
- keyword_arg in function_keyword_renames[full_name]):
- keyword_arg = function_keyword_renames[full_name][keyword_arg]
- self._file_edit.add("Added keyword %r to reordered function %r" %
- (reordered[idx], full_name), lineno, col_offset,
- "", keyword_arg + "=")
-
- # Examine each keyword argument and convert it to the final renamed form
- renamed_keywords = ({} if full_name not in function_keyword_renames else
- function_keyword_renames[full_name])
- for keyword in node.keywords:
- argkey = keyword.arg
- argval = keyword.value
-
- if argkey in renamed_keywords:
- argval_lineno, argval_col_offset = self._find_true_position(argval)
- if argval_lineno is not None and argval_col_offset is not None:
- # TODO(aselle): We should scan backward to find the start of the
- # keyword key. Unfortunately ast does not give you the location of
- # keyword keys, so we are forced to infer it from the keyword arg
- # value.
- key_start = argval_col_offset - len(argkey) - 1
- key_end = key_start + len(argkey) + 1
- if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
- "="):
- self._file_edit.add("Renamed keyword argument from %r to %r" %
- (argkey,
- renamed_keywords[argkey]), argval_lineno,
- argval_col_offset - len(argkey) - 1,
- argkey + "=", renamed_keywords[argkey] + "=")
- continue
- self._file_edit.add(
- "Failed to rename keyword argument from %r to %r" %
- (argkey, renamed_keywords[argkey]),
- argval.lineno,
- argval.col_offset - len(argkey) - 1,
- "",
- "",
- error="Failed to find keyword lexographically. Fix manually.")
-
- ast.NodeVisitor.generic_visit(self, node)
-
- def visit_Attribute(self, node): # pylint: disable=invalid-name
- """Handle bare Attributes i.e. [tf.foo, tf.bar].
-
- Args:
- node: Node that is of type ast.Attribute
- """
- full_name = self._get_attribute_full_path(node)
- if full_name:
- self._rename_functions(node, full_name)
- if full_name in self._api_change_spec.change_to_function:
- if not hasattr(node, "is_function_for_call"):
- new_text = full_name + "()"
- self._file_edit.add("Changed %r to %r" % (full_name, new_text),
- node.lineno, node.col_offset, full_name, new_text)
-
- ast.NodeVisitor.generic_visit(self, node)
-
-
-class ASTCodeUpgrader(object):
- """Handles upgrading a set of Python files using a given API change spec."""
-
- def __init__(self, api_change_spec):
- if not isinstance(api_change_spec, APIChangeSpec):
- raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
- type(api_change_spec))
- self._api_change_spec = api_change_spec
-
- def process_file(self, in_filename, out_filename):
- """Process the given python file for incompatible changes.
-
- Args:
- in_filename: filename to parse
- out_filename: output file to write to
- Returns:
- A tuple representing number of files processed, log of actions, errors
- """
-
- # Write to a temporary file, just in case we are doing an implace modify.
- with open(in_filename, "r") as in_file, \
- tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
- ret = self.process_opened_file(in_filename, in_file, out_filename,
- temp_file)
-
- shutil.move(temp_file.name, out_filename)
- return ret
-
- # Broad exceptions are required here because ast throws whatever it wants.
- # pylint: disable=broad-except
- def process_opened_file(self, in_filename, in_file, out_filename, out_file):
- """Process the given python file for incompatible changes.
-
- This function is split out to facilitate StringIO testing from
- tf_upgrade_test.py.
-
- Args:
- in_filename: filename to parse
- in_file: opened file (or StringIO)
- out_filename: output file to write to
- out_file: opened file (or StringIO)
- Returns:
- A tuple representing number of files processed, log of actions, errors
- """
- process_errors = []
- text = "-" * 80 + "\n"
- text += "Processing file %r\n outputting to %r\n" % (in_filename,
- out_filename)
- text += "-" * 80 + "\n\n"
-
- parsed_ast = None
- lines = in_file.readlines()
- try:
- parsed_ast = ast.parse("".join(lines))
- except Exception:
- text += "Failed to parse %r\n\n" % in_filename
- text += traceback.format_exc()
- if parsed_ast:
- visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
- visitor.visit(parsed_ast)
- out_text, new_text, process_errors = visitor.process(lines)
- text += new_text
- if out_file:
- out_file.write(out_text)
- text += "\n"
- return 1, text, process_errors
-
- # pylint: enable=broad-except
-
- def process_tree(self, root_directory, output_root_directory,
- copy_other_files):
- """Processes upgrades on an entire tree of python files in place.
-
- Note that only Python files. If you have custom code in other languages,
- you will need to manually upgrade those.
-
- Args:
- root_directory: Directory to walk and process.
- output_root_directory: Directory to use as base.
- copy_other_files: Copy files that are not touched by this converter.
-
- Returns:
- A tuple of files processed, the report string ofr all files, and errors
- """
-
- # make sure output directory doesn't exist
- if output_root_directory and os.path.exists(output_root_directory):
- print("Output directory %r must not already exist." %
- (output_root_directory))
- sys.exit(1)
-
- # make sure output directory does not overlap with root_directory
- norm_root = os.path.split(os.path.normpath(root_directory))
- norm_output = os.path.split(os.path.normpath(output_root_directory))
- if norm_root == norm_output:
- print("Output directory %r same as input directory %r" %
- (root_directory, output_root_directory))
- sys.exit(1)
-
- # Collect list of files to process (we do this to correctly handle if the
- # user puts the output directory in some sub directory of the input dir)
- files_to_process = []
- files_to_copy = []
- for dir_name, _, file_list in os.walk(root_directory):
- py_files = [f for f in file_list if f.endswith(".py")]
- copy_files = [f for f in file_list if not f.endswith(".py")]
- for filename in py_files:
- fullpath = os.path.join(dir_name, filename)
- fullpath_output = os.path.join(output_root_directory,
- os.path.relpath(fullpath,
- root_directory))
- files_to_process.append((fullpath, fullpath_output))
- if copy_other_files:
- for filename in copy_files:
- fullpath = os.path.join(dir_name, filename)
- fullpath_output = os.path.join(output_root_directory,
- os.path.relpath(
- fullpath, root_directory))
- files_to_copy.append((fullpath, fullpath_output))
-
- file_count = 0
- tree_errors = []
- report = ""
- report += ("=" * 80) + "\n"
- report += "Input tree: %r\n" % root_directory
- report += ("=" * 80) + "\n"
-
- for input_path, output_path in files_to_process:
- output_directory = os.path.dirname(output_path)
- if not os.path.isdir(output_directory):
- os.makedirs(output_directory)
- file_count += 1
- _, l_report, l_errors = self.process_file(input_path, output_path)
- tree_errors += l_errors
- report += l_report
- for input_path, output_path in files_to_copy:
- output_directory = os.path.dirname(output_path)
- if not os.path.isdir(output_directory):
- os.makedirs(output_directory)
- shutil.copy(input_path, output_path)
- return file_count, report, tree_errors
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
deleted file mode 100644
index 3bedc8cf34..0000000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ /dev/null
@@ -1,115 +0,0 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
-
-# It is possible to override these for releases.
-ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
-ARG TF_AVAILABLE_CPUS=32
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
- build-essential \
- curl \
- git \
- golang \
- libcurl3-dev \
- libfreetype6-dev \
- libpng12-dev \
- libzmq3-dev \
- pkg-config \
- python-dev \
- python-pip \
- rsync \
- software-properties-common \
- unzip \
- zip \
- zlib1g-dev \
- openjdk-8-jdk \
- openjdk-8-jre-headless \
- wget \
- && \
- apt-get clean && \
- rm -rf /var/lib/apt/lists/*
-
-RUN pip --no-cache-dir install --upgrade \
- pip setuptools
-
-RUN pip --no-cache-dir install \
- ipykernel \
- jupyter \
- matplotlib \
- numpy \
- scipy \
- sklearn \
- pandas \
- wheel \
- && \
- python -m ipykernel.kernelspec
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-# https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-# https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to workaround sandboxing issues:
-# https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
- >>/etc/bazel.bazelrc
-WORKDIR /
-RUN mkdir /bazel && \
- cd /bazel && \
- wget --quiet https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
- wget --quiet https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
- chmod +x bazel-*.sh && \
- ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
- rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
- cd tensorflow && \
- git checkout ${TF_BRANCH}
-WORKDIR /tensorflow
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON=python \
- LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
- CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu \
- PYTHON_BIN_PATH=/usr/bin/python \
- PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
- TF_NEED_CUDA=1 \
- TF_CUDA_VERSION=9.0 \
- TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0 \
- TF_CUDNN_VERSION=7
-RUN ./configure
-
-# Build and Install TensorFlow.
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
- LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
- bazel build -c opt \
- --config=cuda \
- --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
- --jobs=${TF_AVAILABLE_CPUS} \
- tensorflow/tools/pip_package:build_pip_package && \
- mkdir /pip_pkg && \
- bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg && \
- pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
- rm -rf /pip_pkg && \
- rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
diff --git a/tensorflow/tools/graph_transforms/remove_control_dependencies.cc b/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
index a900ee65b0..cba6b78fc5 100644
--- a/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
+++ b/tensorflow/tools/graph_transforms/remove_control_dependencies.cc
@@ -24,24 +24,23 @@ namespace graph_transforms {
// inputs which are referenced with "^tensor_name".
// See node_def.proto for more details.
Status RemoveControlDependencies(const GraphDef& input_graph_def,
- const TransformFuncContext& context,
- GraphDef* output_graph_def) {
- output_graph_def->Clear();
- for (const NodeDef& node : input_graph_def.node()) {
- NodeDef* new_node = output_graph_def->mutable_node()->Add();
- *new_node = node;
- new_node->clear_input();
- for (const auto& input : node.input()) {
- if (input[0] != '^') {
- new_node->add_input(input);
- }
+ const TransformFuncContext& context,
+ GraphDef* output_graph_def) {
+ output_graph_def->Clear();
+ for (const NodeDef& node : input_graph_def.node()) {
+ NodeDef* new_node = output_graph_def->mutable_node()->Add();
+ *new_node = node;
+ new_node->clear_input();
+ for (const auto& input : node.input()) {
+ if (input[0] != '^') {
+ new_node->add_input(input);
+ }
+ }
}
- }
- return Status::OK();
+ return Status::OK();
}
-REGISTER_GRAPH_TRANSFORM("remove_control_dependencies",
- RemoveControlDependencies);
+REGISTER_GRAPH_TRANSFORM("remove_control_dependencies", RemoveControlDependencies);
} // namespace graph_transforms
} // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 614457e899..3fbdb5cacd 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -27,6 +27,7 @@ pkg_tar(
":cheaders",
":clib",
":clicenses",
+ ":eager_cheaders",
],
)
@@ -57,7 +58,6 @@ pkg_tar(
name = "cheaders",
files = [
"//tensorflow/c:headers",
- "//tensorflow/c/eager:headers",
],
package_dir = "include/tensorflow/c",
# Mark as "manual" till
@@ -69,6 +69,20 @@ pkg_tar(
)
pkg_tar(
+ name = "eager_cheaders",
+ files = [
+ "//tensorflow/c/eager:headers",
+ ],
+ package_dir = "include/tensorflow/c/eager",
+ # Mark as "manual" till
+ # https://github.com/bazelbuild/bazel/issues/2352
+ # and https://github.com/bazelbuild/bazel/issues/1580
+ # are resolved, otherwise these rules break when built
+ # with Python 3.
+ tags = ["manual"],
+)
+
+pkg_tar(
name = "clib",
files = ["//tensorflow:libtensorflow.so"],
package_dir = "lib",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d6ac7be8b5..85f423f236 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -226,6 +226,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
urls = [
"https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
"https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+ "http://www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
],
sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
strip_prefix = "libjpeg-turbo-1.5.1",
diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch
deleted file mode 100644
index 8b42d10e68..0000000000
--- a/third_party/boringssl/add_boringssl_s390x.patch
+++ /dev/null
@@ -1,133 +0,0 @@
-diff --git a/src/include/openssl/base.h b/src/include/openssl/base.h
-index 7a3adfb..88012ad 100644
---- a/src/include/openssl/base.h
-+++ b/src/include/openssl/base.h
-@@ -94,6 +94,8 @@ extern "C" {
- #define OPENSSL_PNACL
- #elif defined(__myriad2__)
- #define OPENSSL_32_BIT
-+#elif defined(__s390x__)
-+#define OPENSSL_64_BIT
- #else
- #error "Unknown target CPU"
- #endif
-diff --git a/BUILD b/BUILD
-index 6b645e61..c90b7beb 100644
---- a/BUILD
-+++ b/BUILD
-@@ -40,29 +40,46 @@ config_setting(
- values = {"cpu": "darwin"},
- )
-
--boringssl_copts = [
-- # Assembler option --noexecstack adds .note.GNU-stack to each object to
-- # ensure that binaries can be built with non-executable stack.
-- "-Wa,--noexecstack",
--
-- # This is needed on Linux systems (at least) to get rwlock in pthread.
-- "-D_XOPEN_SOURCE=700",
--
-- # This list of warnings should match those in the top-level CMakeLists.txt.
-- "-Wall",
-- "-Werror",
-- "-Wformat=2",
-- "-Wsign-compare",
-- "-Wmissing-field-initializers",
-- "-Wwrite-strings",
-- "-Wshadow",
-- "-fno-common",
--
-- # Modern build environments should be able to set this to use atomic
-- # operations for reference counting rather than locks. However, it's
-- # known not to work on some Android builds.
-- # "-DOPENSSL_C11_ATOMIC",
--] + select({
-+config_setting(
-+ name = "windows",
-+ values = {"cpu": "x64_windows"},
-+ visibility = ["//visibility:public"],
-+)
-+
-+config_setting(
-+ name = "windows_msvc",
-+ values = {"cpu": "x64_windows_msvc"},
-+ visibility = ["//visibility:public"],
-+)
-+
-+boringssl_copts = select({
-+ ":windows": [
-+ "-DWIN32_LEAN_AND_MEAN",
-+ ],
-+ "//conditions:default": [
-+ # Assembler option --noexecstack adds .note.GNU-stack to each object to
-+ # ensure that binaries can be built with non-executable stack.
-+ "-Wa,--noexecstack",
-+
-+ # This is needed on Linux systems (at least) to get rwlock in pthread.
-+ "-D_XOPEN_SOURCE=700",
-+
-+ # This list of warnings should match those in the top-level CMakeLists.txt.
-+ "-Wall",
-+ "-Werror",
-+ "-Wformat=2",
-+ "-Wsign-compare",
-+ "-Wmissing-field-initializers",
-+ "-Wwrite-strings",
-+ "-Wshadow",
-+ "-fno-common",
-+
-+ # Modern build environments should be able to set this to use atomic
-+ # operations for reference counting rather than locks. However, it's
-+ # known not to work on some Android builds.
-+ # "-DOPENSSL_C11_ATOMIC",
-+ ],
-+}) + select({
- ":linux_x86_64": [],
- ":mac_x86_64": [],
- "//conditions:default": ["-DOPENSSL_NO_ASM"],
-@@ -75,18 +92,26 @@ crypto_sources_asm = select({
- })
-
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_c11 = boringssl_copts + [
-- "-std=c11",
-- "-Wmissing-prototypes",
-- "-Wold-style-definition",
-- "-Wstrict-prototypes",
--]
-+boringssl_copts_c11 = boringssl_copts + select({
-+ ":windows": [],
-+ ":windows_msvc": [],
-+ "//conditions:default": [
-+ "-std=c11",
-+ "-Wmissing-prototypes",
-+ "-Wold-style-definition",
-+ "-Wstrict-prototypes",
-+ ],
-+})
-
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_cxx = boringssl_copts + [
-- "-std=c++11",
-- "-Wmissing-declarations",
--]
-+boringssl_copts_cxx = boringssl_copts + select({
-+ ":windows": [],
-+ ":windows_msvc": [],
-+ "//conditions:default": [
-+ "-std=c++11",
-+ "-Wmissing-declarations",
-+ ],
-+})
-
- cc_library(
- name = "crypto",
-@@ -96,6 +121,8 @@ cc_library(
- includes = ["src/include"],
- linkopts = select({
- ":mac_x86_64": [],
-+ ":windows": [],
-+ ":windows_msvc": [],
- "//conditions:default": ["-lpthread"],
- }),
- visibility = ["//visibility:public"],
diff --git a/third_party/nanopb.BUILD b/third_party/nanopb.BUILD
deleted file mode 100644
index d21866911b..0000000000
--- a/third_party/nanopb.BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-# Description:
-# Nanopb, a tiny ANSI C protobuf implementation for use on embedded devices.
-
-licenses(["notice"]) # zlib license
-
-exports_files(["LICENSE.txt"])
-
-cc_library(
- name = "nanopb",
- srcs = [
- "pb_common.c",
- "pb_decode.c",
- "pb_encode.c",
- ],
- hdrs = [
- "pb.h",
- "pb_common.h",
- "pb_decode.h",
- "pb_encode.h",
- ],
- includes = ["."],
- visibility = ["//visibility:public"],
-)