From 9e5529cd62446a883293e8c3f9484b95211add5b Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Mon, 14 May 2018 19:10:20 -0700 Subject: Added segment graphdef conversion functions Functional Dyn Ops --- configure.py | 74 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 31 deletions(-) (limited to 'configure.py') diff --git a/configure.py b/configure.py index 6d9aba61bb..69c9378a9c 100644 --- a/configure.py +++ b/configure.py @@ -977,6 +977,35 @@ def set_tf_cudnn_version(environ_cp): write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) +def is_cuda_compatible(lib, cuda_ver, cudnn_ver): + """Check the compatibility between given library and cudnn/cudart libraries.""" + ldd_bin = which('ldd') or '/usr/bin/ldd' + ldd_out = run_shell([ldd_bin, lib], True) + ldd_out = ldd_out.split(os.linesep) + cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') + cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') + cudnn = None + cudart = None + cudnn_ok = True # assume no cudnn dependency by default + cuda_ok = True # assume no cuda dependency by default + for line in ldd_out: + if 'libcudnn.so' in line: + cudnn = cudnn_pattern.search(line) + cudnn_ok = False + elif 'libcudart.so' in line: + cudart = cuda_pattern.search(line) + cuda_ok = False + if cudnn and len(cudnn.group(1)): + cudnn = convert_version_to_int(cudnn.group(1)) + if cudart and len(cudart.group(1)): + cudart = convert_version_to_int(cudart.group(1)) + if cudnn is not None: + cudnn_ok = (cudnn == cudnn_ver) + if cudart is not None: + cuda_ok = (cudart == cuda_ver) + return cudnn_ok and cuda_ok + + def set_tf_tensorrt_install_path(environ_cp): """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION. @@ -993,8 +1022,8 @@ def set_tf_tensorrt_install_path(environ_cp): raise ValueError('Currently TensorRT is only supported on Linux platform.') # Ask user whether to add TensorRT support. - if str(int(get_var( - environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1': + if str(int(get_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', + False))) != '1': return for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): @@ -1007,47 +1036,29 @@ def set_tf_tensorrt_install_path(environ_cp): # Result returned from "read" will be used unexpanded. That make "~" # unusable. Going through one more level of expansion to handle that. 
- trt_install_path = os.path.realpath( - os.path.expanduser(trt_install_path)) + trt_install_path = os.path.realpath(os.path.expanduser(trt_install_path)) def find_libs(search_path): """Search for libnvinfer.so in "search_path".""" fl = set() if os.path.exists(search_path) and os.path.isdir(search_path): - fl.update([os.path.realpath(os.path.join(search_path, x)) - for x in os.listdir(search_path) if 'libnvinfer.so' in x]) + fl.update([ + os.path.realpath(os.path.join(search_path, x)) + for x in os.listdir(search_path) + if 'libnvinfer.so' in x + ]) return fl possible_files = find_libs(trt_install_path) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib'))) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64'))) - - def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver): - """Check the compatibility between tensorrt and cudnn/cudart libraries.""" - ldd_bin = which('ldd') or '/usr/bin/ldd' - ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep) - cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') - cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') - cudnn = None - cudart = None - for line in ldd_out: - if 'libcudnn.so' in line: - cudnn = cudnn_pattern.search(line) - elif 'libcudart.so' in line: - cudart = cuda_pattern.search(line) - if cudnn and len(cudnn.group(1)): - cudnn = convert_version_to_int(cudnn.group(1)) - if cudart and len(cudart.group(1)): - cudart = convert_version_to_int(cudart.group(1)) - return (cudnn == cudnn_ver) and (cudart == cuda_ver) - cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION']) cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION']) nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$') highest_ver = [0, None, None] for lib_file in possible_files: - if is_compatible(lib_file, cuda_ver, cudnn_ver): + if is_cuda_compatible(lib_file, cuda_ver, cudnn_ver): matches = nvinfer_pattern.search(lib_file) if len(matches.groups()) == 0: continue @@ -1063,12 +1074,13 @@ def set_tf_tensorrt_install_path(environ_cp): # Try another alternative from ldconfig. ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' ldconfig_output = run_shell([ldconfig_bin, '-p']) - search_result = re.search( - '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output) + search_result = re.search('.*libnvinfer.so\\.?([0-9.]*).* => (.*)', + ldconfig_output) if search_result: libnvinfer_path_from_ldconfig = search_result.group(2) if os.path.exists(libnvinfer_path_from_ldconfig): - if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver): + if is_cuda_compatible(libnvinfer_path_from_ldconfig, cuda_ver, + cudnn_ver): trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig) tf_tensorrt_version = search_result.group(1) break @@ -1227,7 +1239,7 @@ def set_tf_cuda_compute_capabilities(environ_cp): # Check whether all capabilities from the input is valid all_valid = True # Remove all whitespace characters before splitting the string - # that users may insert by accident, as this will result in error + # that users may insert by accident, as this will result in error tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split()) for compute_capability in tf_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) -- cgit v1.2.3 From e80732c9895d1283af9b98d6277ad1a1015e2e9a Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 09:57:19 -0700 Subject: Merge changes from github. 
PiperOrigin-RevId: 201011811 --- CONTRIBUTING.md | 2 +- README.md | 1 + RELEASE.md | 67 +++- configure.py | 5 + tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 + tensorflow/cc/gradients/nn_grad.cc | 47 +++ tensorflow/cc/gradients/nn_grad_test.cc | 84 ++++- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 + tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 + tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 ++ .../xla/service/cpu/runtime_single_threaded_fft.h | 31 ++ .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 + tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 + tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ++++ tensorflow/contrib/autograph/__init__.py | 3 + tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 ++ tensorflow/contrib/ffmpeg/__init__.py | 1 - tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 - tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 + tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 + tensorflow/contrib/lite/toco/toco_port.h | 18 + tensorflow/contrib/makefile/compile_nsync.sh | 2 +- .../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 20 +- .../kernels/periodic_resample_op.cc | 5 + .../kernels/periodic_resample_op.h | 415 +++++++++++++++------ .../contrib/periodic_resample/ops/array_ops.cc | 53 ++- .../periodic_resample/ops/array_ops_test.cc | 41 ++ .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- .../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 + .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md 
| 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 +-- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 +++ .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 +++-- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 ++ tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 + .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 +++ .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 + tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 + .../common_runtime/mkl_threadpool_device_test.cc | 53 +++ tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 - tensorflow/core/framework/op_gen_lib.cc | 1 + .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +++++++- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 ++ tensorflow/core/grappler/costs/graph_properties.cc | 1 - tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 + tensorflow/core/kernels/cwise_op_clip.cc | 43 +-- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_functor.cc | 1 + tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_nd_op.cc | 4 + tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 + tensorflow/core/kernels/gather_op.cc | 1 + tensorflow/core/kernels/mkl_concat_op.cc | 213 ++++++++--- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 + tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 + tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 + .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 +++++++ tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 +- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 + tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 ++ tensorflow/core/platform/cpu_info.h | 7 + tensorflow/core/platform/default/build_config.bzl | 2 + .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 + tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 ++- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- 
tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/go/op/wrappers.go | 12 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 + tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- .../python/grappler/layout_optimizer_test.py | 4 +- tensorflow/python/keras/activations.py | 2 + tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 + tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 + .../python/kernel_tests/as_string_op_test.py | 10 + tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 + tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 ++ tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 +++++ tensorflow/python/ops/array_ops.py | 4 + tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 ++++++++++--- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 + tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 + tensorflow/python/ops/string_ops.py | 53 +++ tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 + .../tools/api/golden/tensorflow.strings.pbtxt | 4 + tensorflow/tools/ci_build/builds/pip.sh | 4 + .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 + tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 + .../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 ++ tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- 
.../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 + tensorflow/tools/pip_package/build_pip_package.sh | 160 +++++--- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 + .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 - tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 + third_party/highwayhash.BUILD | 1 + third_party/jpeg/jpeg.BUILD | 2 + third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 232 files changed, 3343 insertions(+), 909 deletions(-) create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h create mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc create mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100755 => 100644 tensorflow/python/tools/import_pb_to_tensorboard.py create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'configure.py') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c25c45..db4b1581ae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 6fb4486d0d..63853137cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ $ python 42 >>> sess.close() ``` +Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index 84d9d52868..e09e9c6190 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,62 @@ +# Release 1.9.0 + +## Major Features And Improvements +* Update tf.keras to the Keras 2.1.6 API. +* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. +* Adding support of core feature columns and losses to gradient boosted trees estimators. +* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. +* Layered variable names have changed in the following conditions: + * Using `tf.keras.layers` with custom variable scopes. + * Using `tf.layers` in a subclassed `tf.keras.Model` class. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details + +## Breaking Chances + * If you're opening empty variable scopes; replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...). 
+ +## Bug Fixes and Other Changes +* `tf.data`: + * The `DatasetBase::DebugString()` method is now `const`. + * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. +* Eager Execution: +* `tf.keras`: + * Move Keras code out of _impl folder and remove API files. + * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. + * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. +* Accelerated Linear Algebra (XLA): +* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). +* `tf.contrib`: + * Add `tf.contrib.data.choose_from_datasets()`. + * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. + * `tf.contrib.framework.zero_initializer` supports ResourceVariable. + * Adding "constrained_optimization" to tensorflow/contrib. +* Other: + * Add GCS Configuration Ops. + * Changing signature of `MakeIterator` to enable propagating error status. + * KL divergence for two Dirichlet distributions. + * More consistent GcsFileSystem behavior for certain reads past EOF. + * Update benchmark for tf.scan to match ranges across eager and graph modes. + * Fixed bug in `tf.reduce_prod gradient` for complex dtypes. + * Add optional `args` argument to `Dataset.from_generator()`. + * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). + * Benchmark for tf.scan in graph and eager modes. + * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. + * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. + * Support indicator column in boosted trees. + * Prevent `tf.gradients()` from backpropagating through integer tensors. + * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. + * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary. + * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. + * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed. + * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. + * Allow LinearOperator to broadcast. + * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. 
Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang + # Release 1.8.0 ## Major Features And Improvements @@ -404,14 +463,6 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index bde7af8c0e..ada342a50a 100644 --- a/configure.py +++ b/configure.py @@ -1397,6 +1397,10 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') +def set_build_strip_flag(): + write_to_bazelrc('build --strip=always') + + def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1519,6 +1523,7 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) + set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index a73c4ca3aa..6d134dbb80 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. 
tf_cc_shared_object( name = "libtensorflow.so", @@ -489,7 +489,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -515,7 +514,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b61..7184ad68fb 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a..35a01e0341 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
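(For reference, a minimal invocation sketch of the updated `tensorflow/c/generate-pc.sh` with its new `-l/--libdir` option; the `lib64` value and the version string below are illustrative assumptions, not taken from the patch.)

```bash
# Hypothetical usage: generate tensorflow.pc with libraries under ${prefix}/lib64
./tensorflow/c/generate-pc.sh --prefix=/usr/local --libdir=lib64 --version=1.9.0
# The emitted tensorflow.pc then contains: libdir=${exec_prefix}/lib64
```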
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 0cb3132e94..c73482d5f4 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); +Status SoftplusGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); + +Status SoftsignGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); + +Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalAvgPoolGrad( + scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), + grad_inputs[0], op.output(1), op.output(2), + internal::FractionalAvgPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); + +Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalMaxPoolGrad( + scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), + op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index c4eba7ecb0..b4d457a9d1 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,6 +28,8 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; +using ops::FractionalAvgPool; +using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -41,6 +43,8 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; +using ops::Softplus; +using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that the max value is largest by - // a reasonable amount. - // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which - // perturbations by the numeric gradient computation in the gradient checker - // can change the max value if values are too close together. + // Sets tensor with random values, ensuring that every pair of elements are at + // least a reasonable amount apart. 
+ // This is an issue for max pooling operations, in which perturbations by the + // numeric gradient computation in the gradient checker can change the max + // value if a pool has values that are too close together. template - void SetRandomValuesWithBumpedMax(Tensor* tensor) { + void SetRandomValuesForMaxPooling(Tensor* tensor) { auto tensor_flat = tensor->flat(); - tensor_flat.setRandom(); - int32 max_index = 0; - for (size_t i = 1; i < tensor->NumElements(); i++) { - if (tensor_flat(i) > tensor_flat(max_index)) { - max_index = i; - } + // First set the array to an increasing sequence of values spaced + // a reasonable amount apart + T cur = 0; + for (size_t i = 0; i < tensor->NumElements(); i++) { + tensor_flat(i) = cur; + cur += 5e-2; + } + // Fischer-Yates shuffle the array + for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { + // j <- random integer 0 <= j <= i + size_t j = random::New64() % (i + 1); + // swap values at i, j + T tmp = tensor_flat(i); + tensor_flat(i) = tensor_flat(j); + tensor_flat(j) = tmp; } - tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } +TEST_F(NNGradTest, SoftplusGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softplus(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, SoftsignGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softsign(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. + auto y = FractionalAvgPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_shape, y.output, y_shape); +} + +TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. 
+ auto y = FractionalMaxPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + Tensor x_init_value = Tensor(DT_FLOAT, x_shape); + SetRandomValuesForMaxPooling(&x_init_value); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_init_value, y.output, y_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6e050cf564..6641d45e83 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index ebfe4806c2..4e194a6aba 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a a sequence of protocol buffers into an object file. +// Embeds a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f00..d1a669ceb1 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb..06ec623eb2 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index d82922a359..1067b38f93 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,6 +178,7 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -516,7 +517,6 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,6 +578,22 @@ cc_library( ], ) +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_fft_impl.h", + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework_lite", + "//third_party/eigen3", + ], +) + cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 215405f680..54c52bc08f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; +extern const char* const kEigenSingleThreadedFftSymbolName = + "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 1dce6efa5c..aa0e967123 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,6 +52,7 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; +extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2c20be155f..758b8c62b4 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,7 +1172,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - const char* fn_name = runtime::kEigenFftSymbolName; + + bool multi_threaded_eigen = + hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + const char* fn_name = multi_threaded_eigen + ? runtime::kEigenFftSymbolName + : runtime::kEigenSingleThreadedFftSymbolName; + llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 984cb0616e..0bf693edd0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,8 +21,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(in_dims); + const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. @@ -179,7 +172,6 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { - CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT type: " << fft_type; + // Unsupported FFT type + abort(); } } @@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT rank " << fft_rank; + // Unsupported FFT rank + abort(); } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc new file mode 100644 index 0000000000..2613ddb127 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" + +#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int32; +using tensorflow::int64; + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* run_options_ptr, void* out, void* operand, int32 fft_type, + int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { + tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, + fft_rank, input_batch, fft_length0, fft_length1, + fft_length2); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h new file mode 100644 index 0000000000..dcd133d012 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, + void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + tensorflow::int64 input_batch, tensorflow::int64 fft_length0, + tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); + +} // extern "C" + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 8d8c5e4c44..c4c90515ac 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -202,6 +203,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index d3bc47e61e..2515222cf2 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index e536c8afbf..77bdcc9de0 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,10 +30,17 @@ limitations under the License. 
namespace xla { +TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : + exclude_entry_computation_(exclude_entry_computation) {} + StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue worklist; for (auto* computation : module->computations()) { + if (exclude_entry_computation_ && + computation == module->entry_computation()) { + continue; + } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index e5e9b10b5b..7509501883 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,13 +27,20 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() {} + TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} + explicit TupleSimplifier(bool exclude_entry_computation); ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + private: + // When set, this pipeline stage will perform optimization of all computations + // apart from the module's entry computation. This is used by Graphcore's + // backend. + bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index ca9ae91281..d3635eae81 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } + void Run(HloModule* module, bool change_expected, bool exclude_entry) { + TupleSimplifier simplifier(exclude_entry); + auto changed_status = simplifier.Run(module); + TF_ASSERT_OK(changed_status.status()); + EXPECT_EQ(change_expected, changed_status.ValueOrDie()); + } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } +TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { + // Verify that the root computation can be excluded + auto module = CreateNewModule(); + + HloInstruction* p0; + HloInstruction* p1; + HloComputation* c0; + HloComputation* c1; + HloComputation* entry; + + { + HloComputation::Builder builder(TestName() + "_1"); + p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c0 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_2"); + p1 = 
builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c1 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_Entry"); + HloInstruction* tuple_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* call0 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); + HloInstruction* call1 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); + HloInstruction* tuple0 = + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); + HloInstruction* gte3 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); + + entry = module->AddEntryComputation(builder.Build()); + } + + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); + + EXPECT_THAT(c0->root_instruction(), p0); + EXPECT_THAT(c1->root_instruction(), p1); + EXPECT_THAT(entry->instruction_count(), 9); +} + } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index 637e49c082..dbdbad8f4c 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,6 +23,7 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
from tensorflow.contrib.autograph import utils +from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -43,6 +44,8 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', + # Overloaded operators + 'operators', # Special functions and directives 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index bda5e26f43..2e0a2fcef4 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,13 +37,15 @@ add_dependencies( tf_core_lib tf_protos_cc) -add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" -) -add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) +if(tensorflow_BUILD_PYTHON_BINDINGS) + add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" + ) + add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) +endif() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index f73da0b8ab..6c90cf398c 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index a0c3ddd28b..9244604489 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,7 +832,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) - add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index cffe069aa3..4f957f1e0b 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,7 +44,8 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") 
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|" + r"python_op_gen_internal|grappler") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -56,6 +57,10 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" + r"tensorflow::errors::Internal|" + r"tensorflow::Tensor::CopyFromInternal|" + r"tensorflow::kernel_factory::" + r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -64,7 +69,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"perftools::gputools") + r"stream_executor::") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 45760a29ee..795f1993ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,16 +151,24 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision. + # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and + # below test fails due to overflow error giving inf. So this check avoids that error by skipping square + # calculation and corresponding assert. + + if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ + np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): + + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.) 
self.assertAllClose( -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), bijector.forward_log_det_jacobian(x, event_ndims=0).eval(), diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index d7909dd5a2..adf92c27ea 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name("function_buffer_resource")) + shared_name=_generate_shared_name( + "contrib_eager_iterator_function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb index 4fe3a0e3f3..5749f22ac5 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -68,7 +68,7 @@ "# simply construct the object. Most layers take as a first argument the number\n", "# of output dimensions / channels.\n", "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensionss is often unnecessary, as it can be inferred\n", + "# The number of input dimensions is often unnecessary, as it can be inferred\n", "# the first time the layer is used, but it can be provided if you want to \n", "# specify it manually, which is useful in some complex models.\n", "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" @@ -267,7 +267,7 @@ " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", " * `call`, where you do the forward computation\n", "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." ] }, { diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 84a413c791..05bcdac2ca 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,7 +346,8 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32): + dtype=dtypes.float32, + normalizer_fn=None): """Returns a feature column that represents sequences of numeric data. 
Example: @@ -370,6 +371,12 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. + normalizer_fn: If not `None`, a function that can be used to normalize the + value of the tensor after `default_value` is applied for parsing. + Normalizer function takes the input `Tensor` as its argument, and returns + the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that + even though the most common use case of this function is normalization, it + can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -383,12 +390,16 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) + if normalizer_fn is not None and not callable(normalizer_fn): + raise TypeError( + 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype) + dtype=dtype, + normalizer_fn=normalizer_fn) def _assert_all_equal_and_return(tensors, name=None): @@ -407,7 +418,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype'])): + ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): """Represents sequences of numeric data.""" @property @@ -419,7 +430,10 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - return inputs.get(self.key) + input_tensor = inputs.get(self.key) + if self.normalizer_fn is not None: + input_tensor = self.normalizer_fn(input_tensor) + return input_tensor @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index ee74cf56dc..45d7b74046 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -947,6 +948,7 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -965,6 +967,10 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') + def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -985,6 +991,41 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor, 
dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_normalizer_fn(self): + + def _increment_two(input_sparse_tensor): + return sparse_ops.sparse_add( + input_sparse_tensor, + sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) + ) + + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, values [[0.], [1]] + # example 1, [[10.]] + indices=((0, 0), (0, 1), (1, 0)), + values=(0., 1., 10.), + dense_shape=(2, 2)) + + # Before _increment_two: + # [[0.], [1.]], + # [[10.], [0.]], + # After _increment_two: + # [[2.], [1.]], + # [[10.], [2.]], + expected_dense_tensor = [ + [[2.], [1.]], + [[10.], [2.]], + ] + numeric_column = sfc.sequence_numeric_column( + 'aaa', normalizer_fn=_increment_two) + + dense_tensor, _ = numeric_column._get_sequence_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) + + with monitored_session.MonitoredSession() as sess: + self.assertAllEqual( + expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index daba965a98..484ffee3e7 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,7 +28,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio -from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index 020b5c99c6..b1b5126d9e 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py -from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 10d1ecc738..dc49383c5c 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec -from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest', 'broadcast_to'] +_allowed_symbols = ['nest'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index 65cb94b5a4..a955e21b72 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - print("expected = ", ref_value) - print("actual = ", value) + tf_logging.info("expected = ", ref_value) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,7 +843,8 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - print("output_height=", output_height, ", output_width=", output_width) + tf_logging.info("output_height=", output_height, ", output_width=", + output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -880,8 +881,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - print("actual_y = ", actual_y) - print("expected_y = ", expected_y) + tf_logging.info("actual_y = ", actual_y) + tf_logging.info("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 6a5d982dc8..2e5c84704f 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 436c3e1d4c..840015a7fa 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. 
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc index 106e3b0270..8b0ace96cc 100644 --- a/tensorflow/contrib/lite/examples/minimal/minimal.cc +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -38,7 +38,7 @@ using namespace tflite; int main(int argc, char *argv[]) { if(argc != 2) { - fprintf(stderr, "Usage: %s \n"); + fprintf(stderr, "minimal \n"); return 1; } const char* filename = argv[1]; diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index bb2e615eac..965273f0f0 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -128,7 +128,6 @@ TensorFlow operation not listed above are likely unsupported. Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) -* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) @@ -306,6 +305,19 @@ Options { } ``` +**GATHER** + +``` +Inputs { + 0: params tensor + 1: indices tensor + 2: axis tensor (optional) +} +Outputs { + 0: a tensor with same type as the params tensor. +} +``` + **GREATER** ``` diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 5efa70987e..26349347fa 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,7 +2,7 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requesits +## Pre-requisite Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. @@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). 
Let say the submission file is located at `/path/to/my_model.lite`, then call: ```sh -bazel build --cxxopt--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite ``` diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index a2f192bbc2..1908f7fa6c 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // The quantization of the input, output arrays is as follows: // - The input activations are quantized as uint8 on the interval // [-1, 127/128]. -// The rationale for that is that that is the natural interval for output +// The rationale for that is that is the natural interval for output // activations (see next point) and these need to be concatenated together. // We could accommodate different ranges by re-scaling, but we empirically // found that setting the input activations range to be [-1, 127/128] in the @@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // However, for a fixed-point implementation in 16-bit integers, using 5 // integer bits to represent the [-16, 16] range would leave only 11 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive -// representable values. Notice that that is higher than the +// representable values. Notice that is higher than the // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. // Using [-8, 8] thus seems like the better compromise overall, enjoying // an increment of 2.4e-4 between representable values and a worst-case diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index 9400e757b9..fd90823425 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -55,7 +55,7 @@ class Interpreter(object): elif model_content and not model_path: self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( - model_content, len(model_content))) + model_content)) if not self._interpreter: raise ValueError( 'Failed to create model from {} bytes'.format(len(model_content))) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc index f705551fcb..b283551c45 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -397,9 +397,14 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile( } InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer( - const char* data, size_t len) { + PyObject* data) { + char * buf = nullptr; + Py_ssize_t length; + if (PY_TO_CPPSTRING(data, &buf, &length) == -1) { + return nullptr; + } std::unique_ptr model = - tflite::FlatBufferModel::BuildFromBuffer(data, len); + tflite::FlatBufferModel::BuildFromBuffer(buf, length); return model ? 
new InterpreterWrapper(std::move(model)) : nullptr; } diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index b0ed7c4559..cbeb53bee7 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -40,8 +40,7 @@ class InterpreterWrapper { static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path); // SWIG caller takes ownership of pointer. - static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data, - size_t len); + static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data); ~InterpreterWrapper(); bool AllocateTensors(); diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 0913cd2c5c..88dda7290b 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -34,6 +34,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from six import PY3 + from google.protobuf import text_format as _text_format from google.protobuf.message import DecodeError from tensorflow.contrib.lite.python import lite_constants as constants @@ -54,6 +56,7 @@ from tensorflow.python.framework.importer import import_graph_def from tensorflow.python.ops.variables import global_variables_initializer from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +# from tensorflow.python.util.all_util import remove_undocumented class TocoConverter(object): @@ -203,6 +206,12 @@ class TocoConverter(object): except (_text_format.ParseError, DecodeError): try: print("Ignore 'tcmalloc: large alloc' warnings.") + + if not isinstance(file_content, str): + if PY3: + file_content = file_content.decode('utf-8') + else: + file_content = file_content.encode('utf-8') _text_format.Merge(file_content, graph_def) except (_text_format.ParseError, DecodeError): raise ValueError( @@ -382,3 +391,5 @@ def _freeze_graph(sess, output_tensors): output_arrays) else: return sess.graph_def + +# remove_undocumented(__name__) diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index e33b430937..5c7fa09891 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) { else if (dtype == DT_STRING) return ArrayDataType::kString; else - LOG(INFO) << "Unsupported data type in placehoder op: " << dtype; + LOG(INFO) << "Unsupported data type in placeholder op: " << dtype; return ArrayDataType::kNone; } diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index 1b21c8bc60..de76fd4032 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -20,6 +20,12 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" +#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) +namespace std { +double round(double x) { return ::round(x); } +} // namespace std +#endif + namespace toco { namespace port { void CopyToBuffer(const string& src, char* dest) { diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 5c019cb2bf..17f82b9dd7 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -34,6 +34,24 @@ limitations under the License. #define TFLITE_PROTO_NS google::protobuf #endif +#ifdef __ANDROID__ +#include +namespace std { + +template +std::string to_string(T value) +{ + std::ostringstream os ; + os << value ; + return os.str() ; +} + +#ifdef __ARM_ARCH_7A__ +double round(double x); +#endif +} +#endif + namespace toco { namespace port { diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index e8c6edd7ba..a28fc3a87f 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -270,7 +270,7 @@ for arch in $archs; do PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ - ../../platform/c++11/src/per_thread_waiter.cc \ + ../../platform/posix/src/per_thread_waiter.c \ ../../platform/c++11/src/yield.cc \ ../../platform/c++11/src/time_rep_timespec.cc \ ../../platform/c++11/src/nsync_panic.cc diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index eff9081e35..48953e2e38 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. -GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 2ed99d50a4..a6be2084aa 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name): name: An optional variable_scope name. Returns: - The recall at a the given `precision`. + The recall at a given `precision`. 
""" precisions = math_ops.div(tp, tp + fp + _EPSILON) tf_index = math_ops.argmin( diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h index 1d56d588bc..c001615d3f 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h @@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI(); * has the fully accumulated Segment 1; and so on. The scatter-reduce is * complete. * - * Next, the allgather distributes these fully accumululated chunks across all + * Next, the allgather distributes these fully accumulated chunks across all * nodes. Communication proceeds in the same ring, once again in N-1 steps. At * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). * For example, at the first iteration, the following transfers will occur: diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 21bf3f5313..915e6504e1 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase): var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), + rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), + rtol=1e-2) if use_resource: self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index a7c97a1da2..b6b10e500b 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object): """ def __init__(self, worker_device): - """Create a new `ElasticAverageCustomGetter`. + """Create a new `ModelAverageCustomGetter`. Args: worker_device: String. Name of the `worker` job. 
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index 6ca7fe8b6e..aad1ca04c5 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,12 +6,13 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "py_test", + "tf_cc_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) +load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -84,6 +85,23 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", + "//tensorflow/python:gradient_checker", + ], +) + +tf_cc_test( + name = "periodic_resample_op_cc_test", + size = "small", + srcs = [ + "ops/array_ops_test.cc", + ], + deps = [ + ":all_ops", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_proto", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index e18923c8aa..514689cf45 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,4 +22,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); + +REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") + .Device(DEVICE_CPU), + PeriodicResampleOpGrad); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 3ab588c458..42fba81a5c 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,92 +25,202 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" namespace { -template -IndexT compute_input_index( - IndexVecT* target_dimensions, const IndexT& output_index, - const IndexVecT& original_dimensions, const int& adjustable_dimension, - const std::vector& dimension_ceiling, - const std::vector& cumulative_dimensions, IndexT* result, - std::vector* output_indices, const int& rank) { - *result = 0; - output_indices->clear(); +// Computes input tensor index for given output index during forward +// propagation through periodic_resample operation. +class InputIndexer { + public: + InputIndexer(const std::vector& output_dimensions, + const tensorflow::TensorShape& input_shape, + int adjustable_dimension) + : output_dimensions_(output_dimensions), + adjustable_dimension_(adjustable_dimension), + rank_(input_shape.dims()), + linear_output_index_(0), + linear_input_index_(0), + adjustable_dimension_carriage_sum_(0) { + auto input_dimensions = TensorShapeToVector(input_shape); + // factors by which input_dimensions increases/decreases w.r.t. 
+ // output_dimensions + dimension_ceiling_ = + ComputeDimensionCeiling(output_dimensions, input_dimensions); + cumulative_dimensions_ = ComputeCumulativeDimensions(); + + output_indices_.resize(output_dimensions_.size()); + input_indices_.resize(output_dimensions_.size()); + + // Compute index_factors + index_factors_.resize(rank_); + tensorflow::int64 last_index_factor = 1; + for (auto r = rank_ - 1; r >= 0; --r) { + index_factors_[r] = last_index_factor; + last_index_factor *= input_dimensions[r]; + } + } + + tensorflow::int64 linear_input_index() const { return linear_input_index_; } + + void MoveToOutputIndex(tensorflow::int64 output_index); + void IncrementOutputIndex(); + + private: + void RecomputeInputAdjustableDimensionIndex() { + tensorflow::int64 index = adjustable_dimension_carriage_sum_; + index *= output_dimensions_[adjustable_dimension_]; + index += output_indices_[adjustable_dimension_]; + input_indices_[adjustable_dimension_] = index; + } + + std::vector TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape); + + std::vector ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions); + + std::vector ComputeCumulativeDimensions(); + + const std::vector output_dimensions_; + std::vector dimension_ceiling_; + std::vector index_factors_; + std::vector cumulative_dimensions_; + std::vector output_indices_; + std::vector input_indices_; + + const int adjustable_dimension_; + const int rank_; + tensorflow::int64 linear_output_index_; + tensorflow::int64 linear_input_index_; + tensorflow::int64 adjustable_dimension_carriage_sum_; +}; + +void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) { + linear_output_index_ = output_index; + linear_input_index_ = 0; // un-rasterize the output index auto last_reduced_i = output_index; - for (auto r = rank - 1; r >= 0; --r) { - (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + output_indices_[r] = last_reduced_i % output_dimensions_[r]; last_reduced_i = - (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r]; + (last_reduced_i - output_indices_[r]) / output_dimensions_[r]; } + tensorflow::int64 carriage_sum = 0; + for (int qi = 0; qi < rank_; ++qi) { + if (qi == adjustable_dimension_) continue; + carriage_sum += cumulative_dimensions_[qi] * + (output_indices_[qi] % dimension_ceiling_[qi]); + } + adjustable_dimension_carriage_sum_ = carriage_sum; + // rasterize the input index - IndexT last_index_factor = 1; - for (auto r = rank - 1; r >= 0; --r) { - IndexT index = 0; - if (r != adjustable_dimension) - index = (*output_indices)[r] / dimension_ceiling[r]; - else { - for (int qi = 0; qi < rank; ++qi) { - if (qi == adjustable_dimension) continue; - index += cumulative_dimensions[qi] * - ((*output_indices)[qi] % dimension_ceiling[qi]); - } - index *= (*target_dimensions)[adjustable_dimension]; - index += (*output_indices)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + if (r != adjustable_dimension_) { + input_indices_[r] = output_indices_[r] / dimension_ceiling_[r]; + } else { + RecomputeInputAdjustableDimensionIndex(); } - *result += last_index_factor * index; - last_index_factor *= original_dimensions[r]; } + for (auto r = rank_ - 1; r >= 0; --r) { + linear_input_index_ += index_factors_[r] * input_indices_[r]; + } +} + +void InputIndexer::IncrementOutputIndex() { + linear_output_index_++; + for (auto r = rank_ - 1; r >= 0; --r) { + auto old_carriage_sum_increment = + cumulative_dimensions_[r] * + 
(output_indices_[r] % dimension_ceiling_[r]); + output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r]; + if (r != adjustable_dimension_) { + auto new_input_index = output_indices_[r] / dimension_ceiling_[r]; + linear_input_index_ += + (new_input_index - input_indices_[r]) * index_factors_[r]; + + input_indices_[r] = new_input_index; + + auto new_carriage_sum_increment = + cumulative_dimensions_[r] * + (output_indices_[r] % dimension_ceiling_[r]); - return *result; + adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ - + old_carriage_sum_increment + + new_carriage_sum_increment; + } + + if (output_indices_[r] != 0) { + // No more carries to higher indices. + break; + } + } + auto old_adjustable_dimension_input_index = + input_indices_[adjustable_dimension_]; + RecomputeInputAdjustableDimensionIndex(); + linear_input_index_ += (input_indices_[adjustable_dimension_] - + old_adjustable_dimension_input_index) * + index_factors_[adjustable_dimension_]; } -template // both types are needed here b/c IndexVecT and - // InputDataT are not related - void - fill_periodic_tensor( - tensorflow::OpKernelContext* context, - const IndexVecT& desired_shape, - const tensorflow::Tensor& input_tensor) { - // input is a strided array (last index is fastest, C-ordered) - auto input = input_tensor.flat(); - const int rank = input_tensor.dims(); - // original and target dimensions - std::vector original_dimensions(rank), - target_dimensions(rank); - tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1); - // factors by which original_dimensions increases/decreases w.r.t. - // target_dimensions - std::vector dimension_ceiling(rank), - cumulative_dimensions(rank); - // index of adjustable dimension - int adjustable_dimension; - tensorflow::TensorShape output_shape; +std::vector InputIndexer::TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape) { + std::vector result(tensor_shape.dims()); + int count = 0; + for (const auto dim_info : tensor_shape) { + result[count] = dim_info.size; + ++count; + } + return result; +} - // requires that the rank of the input tensor and length of the desired shape - // are equal - OP_REQUIRES(context, rank == desired_shape.size(), - tensorflow::errors::InvalidArgument( - "periodic_resample expects the rank of the input tensor, ", - rank, ", to be the same as the length of the desired shape, ", - desired_shape.size(), ".")); +std::vector InputIndexer::ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions) { + std::vector dimension_ceiling(input_dimensions.size()); + for (size_t i = 0; i < input_dimensions.size(); ++i) { + dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) / + input_dimensions[i]; + } + return dimension_ceiling; +} - bool found = false; - const auto& input_tensor_shape = input_tensor.shape(); +std::vector InputIndexer::ComputeCumulativeDimensions() { + std::vector cumulative_dimensions(rank_); + int count = 0; + for (int i = 0; i < rank_; ++i) { + if (count == 0) { + cumulative_dimensions[count] = 1; + } else { + cumulative_dimensions[count] = + cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1]; + } + ++count; + } + return cumulative_dimensions; +} +template +void process_desired_shape(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& input_tensor_shape, + const IndexVecT& desired_shape, + int* adjustable_dimension, + std::vector* target_dimensions, + tensorflow::int64* output_size) { + 
tensorflow::int64 new_sliced_size = 1; + bool found = false; + const int rank = input_tensor_shape.dims(); for (int i = 0; i < rank; ++i) { - // if (desired_shape(i) < 1) { if (desired_shape[i] < 1) { // only one index can be adjustable OP_REQUIRES(context, !found, tensorflow::errors::InvalidArgument( "periodic_resample expects only " "one index to be marked as adjustable.")); - adjustable_dimension = i; + *adjustable_dimension = i; found = true; } else { OP_REQUIRES( @@ -122,9 +232,8 @@ template +void +do_periodic_resample_op(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape, + const tensorflow::Tensor& source_tensor) { + const int rank = source_tensor.dims(); + + // requires that the rank of the input tensor and length of the desired shape + // are equal + OP_REQUIRES(context, rank == desired_shape.dims(), + tensorflow::errors::InvalidArgument( + "periodic_resample expects the rank of the input tensor, ", + rank, ", to be the same as the length of the desired shape, ", + desired_shape.dims(), ".")); + + std::vector target_dimensions(rank); + tensorflow::int64 new_size = 0; + // index of adjustable dimension + int adjustable_dimension = 0; + process_desired_shape(context, original_shape, desired_shape.dim_sizes(), + &adjustable_dimension, &target_dimensions, &new_size); // ensure that the new dimension is greater than zero OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0, @@ -160,11 +293,14 @@ template allocate_output(0, output_shape, &output_tensor)); auto output = output_tensor->flat(); - // memory is allocated for these variables outside the inner loop for - // efficiency (although, I could create a separate class scope for - // this purpose instead) - tensorflow::int64 result = 0; - std::vector output_indices(target_dimensions.size()); + // input is a strided array (last index is fastest, C-ordered) + auto input = source_tensor.flat(); // Fill output tensor with periodically resampled input tensor values - for (tensorflow::int64 output_index = 0; output_index < new_size; - ++output_index) { - output(output_index) = input(compute_input_index( - &target_dimensions, output_index, original_dimensions, - adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result, - &output_indices, rank)); - } + InputIndexer input_indexer(target_dimensions, original_shape, + adjustable_dimension); + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + auto fill_output_tensor = [&input_indexer, &output, &input]( + tensorflow::int64 start, tensorflow::int64 limit) { + InputIndexer local_indexer(input_indexer); + local_indexer.MoveToOutputIndex(start); + for (tensorflow::int64 output_index = start; output_index < limit; + ++output_index) { + if (mode == Mode::kForward) { + output(output_index) = input(local_indexer.linear_input_index()); + } else { + output(local_indexer.linear_input_index()) = input(output_index); + } + local_indexer.IncrementOutputIndex(); + } + }; + ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, + new_size, costPerFillIndex, fill_output_tensor); } +#define DATA_TYPE_SWITCH(data_type, context, CASE) \ + switch (data_type) { \ + CASE(float) \ + CASE(double) \ + CASE(tensorflow::int32) \ + CASE(tensorflow::int64) \ + default: \ + context->CtxFailure(__FILE__, __LINE__, \ + tensorflow::errors::InvalidArgument( \ + "Unsuppored tensor elements type")); \ + break; \ + } + void create_output_tensor( tensorflow::OpKernelContext* 
context, const tensorflow::Tensor& input_tensor, const tensorflow::DataType& input_tensor_type, - const tensorflow::PartialTensorShape& desired_shape_tensor) { - auto desired_shape = desired_shape_tensor.dim_sizes(); - - // obligatory type switch - switch (input_tensor_type) { - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, input_tensor.shape(), desired_shape, input_tensor); \ break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); + + DATA_TYPE_SWITCH(input_tensor_type, context, CASE); +#undef CASE +} + +void create_grad_tensor(tensorflow::OpKernelContext* context, + const tensorflow::Tensor& grad_tensor, + const tensorflow::DataType& grad_tensor_type, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, original_shape, desired_shape, grad_tensor); \ break; - default:; - } + + DATA_TYPE_SWITCH(grad_tensor_type, context, CASE); +#undef CASE } } // namespace @@ -238,4 +400,25 @@ class PeriodicResampleOp : public tensorflow::OpKernel { tensorflow::PartialTensorShape desired_shape; }; +class PeriodicResampleOpGrad : public tensorflow::OpKernel { + public: + explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context) + : tensorflow::OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("original_shape", &original_shape)); + OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape)); + } + + void Compute(tensorflow::OpKernelContext* context) override { + const tensorflow::Tensor& grad_tensor = context->input(0); + const tensorflow::DataType grad_tensor_type = context->input_dtype(0); + create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape, + desired_shape); + } + + private: + tensorflow::TensorShape original_shape; + tensorflow::PartialTensorShape desired_shape; +}; + #endif // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_ diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc index 82bd796956..fd38cd09b4 100644 --- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc +++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc @@ -26,7 +26,42 @@ REGISTER_OP("PeriodicResample") .Input("values: T") .Attr("shape: shape") .Output("output: T") - .SetShapeFn(shape_inference::ExplicitShape) + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::PartialTensorShape desired_shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape)); + shape_inference::ShapeHandle input_tensor_shape = c->input(0); + shape_inference::DimensionHandle num_input_elements = + c->NumElements(input_tensor_shape); + shape_inference::ShapeHandle result_shape_handle; + if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) { + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + desired_shape, &result_shape_handle)); + } else { + const int rank = c->Rank(input_tensor_shape); + std::vector target_dimensions(rank); 
+ tensorflow::int64 new_sliced_size = 1; + int adjustable_dimension = 0; + for (int i = 0; i < rank; ++i) { + if (desired_shape.dim_size(i) < 1) { + adjustable_dimension = i; + } else { + target_dimensions[i] = desired_shape.dim_size(i); + new_sliced_size *= target_dimensions[i]; + } + } + target_dimensions[adjustable_dimension] = + shape_inference::InferenceContext::Value( + num_input_elements) / new_sliced_size; + tensorflow::TensorShape result_shape; + for (int i = 0; i < rank; ++i) { + result_shape.AddDim(target_dimensions[i]); + } + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape( + result_shape, &result_shape_handle)); + } + c->set_output(0, result_shape_handle); + return Status::OK(); + }) .Doc(R"doc( Periodically resample elements of a tensor to conform to `shape`. @@ -101,4 +136,20 @@ output: Periodically resampled tensor that has dimensions specified as in )doc"); + +REGISTER_OP("PeriodicResampleOpGrad") + .Attr("T: numbertype") + .Input("grad: T") + .Attr("original_shape: shape") + .Attr("desired_shape: shape") + .Output("grad_values: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::TensorShape original_shape; + TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s)); + c->set_output(0, s); + return Status::OK(); +}); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc new file mode 100644 index 0000000000..43b7c1799f --- /dev/null +++ b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { + ShapeInferenceTestOp op("PeriodicResample"); + // Case 1: output shape can be fully inferreed. + PartialTensorShape shape({4, 4, -1}); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + + TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") + .Input({"values", 0, DT_INT32}) + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "[2,2,4]", "[4,4,1]"); + // Case 2: output shape can not be inferred - report desired shape. 
+ INFER_OK(op, "[2,2,?]", "[4,4,?]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index a25de55e18..31a6fe1d94 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,8 +21,11 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -93,7 +96,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): - variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -103,6 +105,29 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() + def testPeriodicResampleGradient(self): + desired_shape = numpy.array([4, 4, None]) + result_shape = (4, 4, 1) + input_shape = (2, 2, 4) + with self.test_session() as sess: + x = array_ops.placeholder(dtypes.float32, shape=input_shape) + output = periodic_resample(x, desired_shape) + error = gradient_checker.compute_gradient_error( + x, input_shape, output, result_shape) + self.assertLess(error, 1e-4) + + def testPeriodicResampleShapeInference(self): + with self.test_session() as sess: + # Case 1: output shape can be fully inferreed. + x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) + output = periodic_resample(x, [4, 4, None]) + self.assertEqual(output.shape, [4, 4, 1]) + # Case 2: output shape can not be inferred - report desired shape. 
+ x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) + output = periodic_resample(x, [4, 4, None]) + self.assertTrue(output.shape.is_compatible_with([4, 4, None])) + self.assertEqual(output.shape[2].value, None) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 348623d8f8..470e300ccb 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,11 +21,17 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad from tensorflow.contrib.util import loader +from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) + +@ops.RegisterGradient("PeriodicResample") +def _periodic_resample_grad_cc(op, grad): + return periodic_resample_op_grad( + grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index b7a98c68e2..af3b2ad1b5 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,7 +34,8 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -48,6 +49,7 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -58,6 +60,7 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index d78d94c269..a725072e72 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,7 +51,8 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -62,6 +63,7 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. 
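Stepping back to the `RegisterGradient` hook added earlier in this patch: once `PeriodicResample` has a registered gradient, `tf.gradients` (and therefore optimizers) can differentiate through it instead of failing with a missing-gradient lookup error. A rough TF 1.x usage sketch, assuming the compiled op library is importable (illustrative only, not part of the diff):

```python
import tensorflow as tf
from tensorflow.contrib.periodic_resample import periodic_resample

x = tf.placeholder(tf.float32, shape=(2, 2, 4))
y = periodic_resample(x, [4, 4, None])
# With the gradient registered, this resolves to PeriodicResampleOpGrad.
dx, = tf.gradients(tf.reduce_sum(y), [x])
```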
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -71,6 +73,7 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index 6e77e934fe..f275bc15ad 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,7 +30,8 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -44,6 +45,7 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -62,13 +64,15 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph) + graph=graph, + config=config) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -79,6 +83,7 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -93,14 +98,19 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) + estimator, + serving_input_receiver_fn, + output_key=output_key, + graph=graph, + config=config) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -115,6 +125,7 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -128,4 +139,5 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph) + graph=graph, + config=config) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index 578d9424b2..a2ef1dc3af 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common +from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -41,6 +42,11 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') + def testFromSavedModelWithSessionConfig(self): + """Test loading from_saved_model with session config.""" + predictor_factories.from_saved_model( + self._export_dir, config=config_pb2.ConfigProto()) + def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -53,6 +59,13 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') + def testFromContribEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + predictor_factories.from_contrib_estimator( + estimator, input_fn, output_alternative_key='sum', + config=config_pb2.ConfigProto()) + def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -64,6 +77,12 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) + def testFromCoreEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + predictor_factories.from_estimator( + estimator, input_fn, config=config_pb2.ConfigProto()) + def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 0dbca0f813..95da6d04ed 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,7 +121,8 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -142,6 +143,7 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
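To show what the new `config` argument buys callers, here is an illustrative sketch (not part of the patch; the export directory is hypothetical): a `ConfigProto` can now be threaded through any of the factory functions down to the underlying session.

```python
from tensorflow.contrib.predictor import predictor_factories
from tensorflow.core.protobuf import config_pb2

session_config = config_pb2.ConfigProto(
    allow_soft_placement=True,
    intra_op_parallelism_threads=2)

# The same keyword works for from_estimator and from_contrib_estimator.
predictor = predictor_factories.from_saved_model(
    '/tmp/my_export_dir',  # hypothetical path
    config=session_config)
```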
@@ -152,7 +154,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session() + self._session = session.Session(config=config) loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index c83623ec94..27a933c0f9 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). +[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 94fc12ca81..3d0308aaf3 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,7 +26,6 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -37,6 +36,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,9 +136,10 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) - accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, - labels) + accuracy0, update_op0 = metrics.accuracy( + labels=labels, predictions=predictions) + accuracy1, update_op1 = metrics.accuracy( + labels=labels, predictions=predictions + 1) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -198,8 +199,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metric_ops.streaming_accuracy( - predictions_limited, labels_limited) + value_op, update_op = metrics.accuracy( + labels=labels_limited, 
predictions=predictions_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -260,8 +261,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -276,8 +277,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index 99ced53e11..d22b80ac88 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,6 +21,7 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -30,9 +31,11 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... +``` To use it with graph execution, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -53,7 +56,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
- +``` """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index e893e1d1c8..d8236a0a6f 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,12 +38,13 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) + return metrics.mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metric_ops.streaming_accuracy(predictions, targets, weights=weights) + return metrics.accuracy( + labels=targets, predictions=predictions, weights=weights) def _r2(probabilities, targets, weights=None): @@ -53,7 +54,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metric_ops.streaming_mean(score, weights=weights) + return metrics.mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -62,7 +63,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -71,7 +72,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -82,7 +83,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -90,34 +91,36 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metric_ops.streaming_precision(predictions, targets, weights=weights) + return metrics.precision( + labels=targets, predictions=predictions, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metric_ops.streaming_precision_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.precision_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metric_ops.streaming_recall(predictions, targets, weights=weights) + return metrics.recall( + labels=targets, predictions=predictions, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metric_ops.streaming_recall_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.recall_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), - targets, weights=weights) + return metrics.auc( + labels=targets, + predictions=array_ops.slice(probs, [0, 1], [-1, 1]), + weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 7a35a70bbe..6f62cd11a9 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeTrainingVariables(object): +class TreeVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeTrainingVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training): + def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,27 +315,28 @@ class TreeTrainingVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, '', self.get_tree_name('stats', tree_num)) + params, tree_stat, self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, '', self.stats, self.get_tree_name('tree', tree_num)) + params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestTrainingVariables(object): +class ForestVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeTrainingVariables object for each tree. We override the + Instantiates a TreeVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestTrainingVariables(params) + forest_variables = ForestVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeTrainingVariables): + tree_variables_class=TreeVariables, + tree_configs=None, tree_stats=None): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
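The metric rewrites above all follow the same pattern: the deprecated `tf.contrib.metrics.streaming_*` helpers take `(predictions, labels)` positionally, while the core `tf.metrics` equivalents are called with keyword arguments and `labels` first. A small self-contained sketch of the migration (not part of the patch):

```python
import tensorflow as tf

labels = tf.constant([1, 0, 1, 1])
predictions = tf.constant([1, 0, 0, 1])

# Replacement for metric_ops.streaming_accuracy(predictions, labels):
value_op, update_op = tf.metrics.accuracy(labels=labels,
                                          predictions=predictions)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())  # metric state lives in local vars
  sess.run(update_op)
  print(sess.run(value_op))  # 0.75
```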
@@ -347,7 +348,13 @@ class ForestTrainingVariables(object): for i in range(params.num_trees): with ops.device(self.device_dummies[i].device): - self.variables.append(tree_variables_class(params, i, training)) + kwargs = {} + if tree_configs is not None: + kwargs.update(dict(tree_config=tree_configs[i])) + if tree_stats is not None: + kwargs.update(dict(tree_stat=tree_stats[i])) + self.variables.append(tree_variables_class( + params, i, training, **kwargs)) def __setitem__(self, t, val): self.variables[t] = val @@ -361,9 +368,11 @@ class RandomForestGraphs(object): def __init__(self, params, + tree_configs=None, + tree_stats=None, device_assigner=None, variables=None, - tree_variables_class=TreeTrainingVariables, + tree_variables_class=TreeVariables, tree_graphs=None, training=True): self.params = params @@ -371,9 +380,10 @@ class RandomForestGraphs(object): device_assigner or framework_variables.VariableDeviceChooser()) logging.info('Constructing forest with params = ') logging.info(self.params.__dict__) - self.variables = variables or ForestTrainingVariables( + self.variables = variables or ForestVariables( self.params, device_assigner=self.device_assigner, training=training, - tree_variables_class=tree_variables_class) + tree_variables_class=tree_variables_class, + tree_configs=tree_configs, tree_stats=tree_stats) tree_graph_class = tree_graphs or RandomTreeGraphs self.trees = [ tree_graph_class(self.variables[i], self.params, i) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index bbe627b157..1c9c81827e 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from google.protobuf.json_format import ParseDict +from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.ops import resources +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -110,6 +114,47 @@ class TensorForestTest(test_util.TensorFlowTestCase): self.assertTrue(isinstance(paths, ops.Tensor)) self.assertTrue(isinstance(var, ops.Tensor)) + def testInfrenceFromRestoredModel(self): + input_data = [[-1., 0.], [-1., 2.], # node 1 + [1., 0.], [1., -2.]] # node 2 + expected_prediction = [[0.0, 1.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]] + hparams = tensor_forest.ForestHParams( + num_classes=2, + num_features=2, + num_trees=1, + max_nodes=1000, + split_after_samples=25).fill() + tree_weight = {'decisionTree': + {'nodes': + [{'binaryNode': + {'rightChildId': 2, + 'leftChildId': 1, + 'inequalityLeftChildTest': + {'featureId': {'id': '0'}, + 'threshold': {'floatValue': 0}}}}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 1}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 2}]}} + restored_tree_param = ParseDict(tree_weight, + _tree_proto.Model()).SerializeToString() + graph_builder = tensor_forest.RandomForestGraphs(hparams, + [restored_tree_param]) + probs, paths, var = graph_builder.inference_graph(input_data) + 
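The per-tree plumbing added above reduces to a small keyword-forwarding pattern: a keyword is passed only when the caller actually supplied per-tree data, so `TreeVariables` keeps its empty-string defaults otherwise. A standalone sketch of just that pattern (illustrative names, not the real classes):

```python
def tree_kwargs(i, tree_configs=None, tree_stats=None):
  kwargs = {}
  if tree_configs is not None:
    kwargs['tree_config'] = tree_configs[i]
  if tree_stats is not None:
    kwargs['tree_stat'] = tree_stats[i]
  return kwargs

print(tree_kwargs(0))                             # {} -> defaults apply
print(tree_kwargs(1, tree_configs=['t0', 't1']))  # {'tree_config': 't1'}
```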
self.assertTrue(isinstance(probs, ops.Tensor)) + self.assertTrue(isinstance(paths, ops.Tensor)) + self.assertTrue(isinstance(var, ops.Tensor)) + with self.test_session(): + variables.global_variables_initializer().run() + resources.initialize_resources(resources.shared_resources()).run() + self.assertEquals(probs.eval().shape, (4, 2)) + self.assertEquals(probs.eval().tolist(), expected_prediction) + def testTrainingConstructionClassificationSparse(self): input_data = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b7b26cfb1c..da4dd5a14c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -91,8 +91,11 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, if (!subgraph_node_ids.count(edge->src()->id()) && !edge->src()->IsSource() && !edge->IsControlEdge()) { incoming_edges->insert(edge); + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " Y, "; } else { - VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, "; + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " N, "; } } } @@ -106,10 +109,12 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, for (const tensorflow::Edge* edge : node->out_edges()) { if (!subgraph_node_ids.count(edge->dst()->id()) && !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " Y, "; outgoing_edges->insert(edge); } else { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " N, "; } } } @@ -181,29 +186,27 @@ struct ConvertGraphParams { static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_incoming_edges); + + std::set> unique_tensors; + // Add only unique input source nodes. 
If the output of an outside node is shared + // between multiple nodes inside the engine, only one edge should be created. for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()}); - } - auto output_name_to_index_map = BuildTensorNameMap(p->output_names); - std::set<std::pair<int, int>> subgraph_outputs_set; - // Collect outputs referenced from output_names - for (int node_id : p->subgraph_node_ids) { - tensorflow::Node* node = p->graph.FindNodeId(node_id); - if (output_name_to_index_map.count(node->name())) { - for (int index : output_name_to_index_map.at(node->name())) { - subgraph_outputs_set.insert({node_id, index}); - } - } + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } + p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), + unique_tensors.end()); GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_outgoing_edges); + unique_tensors.clear(); + // Similar to above, if multiple outside nodes are sharing the output of an + // internal node, only one output port should be created and shared between + // outputs. for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()}); + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } - p->subgraph_outputs.reserve(subgraph_outputs_set.size()); + p->subgraph_outputs.reserve(unique_tensors.size()); p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - subgraph_outputs_set.begin(), - subgraph_outputs_set.end()); + unique_tensors.begin(), unique_tensors.end()); return tensorflow::Status::OK(); } @@ -225,7 +228,6 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) { for (auto in_edge : params->subgraph_incoming_edges) { // loop over incoming edges and // attach them to calib node - // tensorflow::Node* src_node = in_edge->src(); auto src_output = in_edge->src_output(); auto dst_node = in_edge->dst(); auto dst_input = in_edge->dst_input(); @@ -257,19 +259,24 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); } + std::set<std::pair<int, int>> unique_tensors; for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()}; + if (unique_tensors.count(old_src)) continue; + unique_tensors.insert(old_src); int new_src_output = subgraph_edge_to_input_map.at(old_src); params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, new_src_output); + VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() + << " -> " << trt_node->name() << ":" << new_src_output; params->graph.RemoveEdge(edge); } - - VLOG(2) << "new wiring edges: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + if (VLOG_IS_ON(2)) { + VLOG(2) << "new edge count: " << trt_node->in_edges().size(); + for (const tensorflow::Edge* edge : trt_node->in_edges()) { + VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + } } - TF_RETURN_IF_ERROR(status); // Re-map outgoing edges to use the new TRT node instead of the orig subgraph @@ -283,6 +290,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { int new_src_output = subgraph_edge_to_output_map.at(old_src); TF_RETURN_IF_ERROR(params->graph.UpdateEdge( trt_node,
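The de-duplication introduced above keys subgraph endpoints by `(node_id, output_port)`, so a tensor consumed by several nodes inside the engine is wired to a single engine port. A minimal Python sketch of the idea (ordering follows a sorted set, roughly like the `std::set` used here; not part of the patch):

```python
def unique_engine_ports(edges):
  """edges: iterable of (src_node_id, src_output_port, dst_node_id)."""
  unique = {(src, port) for src, port, _dst in edges}
  # One engine port per unique source tensor, in sorted order.
  return {tensor: i for i, tensor in enumerate(sorted(unique))}

edges = [(1, 0, 10), (1, 0, 11), (2, 1, 10)]  # tensor 1:0 feeds two inner nodes
print(unique_engine_ports(edges))  # {(1, 0): 0, (2, 1): 1}
```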
new_src_output, edge->dst(), edge->dst_input())); + VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " + << edge->dst()->name() << ":" << edge->dst_input(); } // Remove the original subgraph for (int node_id : params->subgraph_node_ids) { @@ -317,9 +326,12 @@ tensorflow::Status ConvertCalibGraphToInferGraph( tensorflow::GraphConstructorOptions(), graph_def, &graph)); // get calib nodes std::vector calib_nodes; - for (auto node : graph.op_nodes()) { + std::vector topo_order; + tensorflow::GetPostOrder(graph, &topo_order); + for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { + auto node = *rit; if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node"; + VLOG(1) << "Found Calib Node " << node->name(); calib_nodes.push_back(node); } } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 96e0700862..4e4d295538 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -362,10 +362,11 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast( - const_cast(oweights->GetValues())), - ostrides); + Reorder2( + {k, c}, static_cast(iweights.GetValues()), + istrides, + static_cast(const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -1179,9 +1180,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + - " not supported at: " + - node_def.name()); + return tensorflow::errors::Unimplemented( + "binary op: " + node_def.op() + + " not supported at: " + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -2138,9 +2139,7 @@ void Converter::register_op_converters() { } } // namespace -tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { - return tensorflow::errors::Unimplemented("Not implemented yet"); -} + tensorflow::Status ConvertCalibrationNodeToEngineNode( tensorflow::Graph& graph, tensorflow::Node* c_node) { const auto ndef = c_node->def(); @@ -2164,9 +2163,23 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( for (auto n : graph.op_nodes()) { node_maps.insert({n->name(), n}); } + std::set subgraph_ids; + for (const auto internal_node : segment_nodes) { + subgraph_ids.insert(node_maps.at(internal_node)->id()); + } + if (VLOG_IS_ON(2)) { + string node_names = StrCat(c_node->name(), " segment nodes= "); + + for (const auto& node_name : segment_nodes) { + StrAppend(&node_names, node_name, ", "); + } + VLOG(2) << node_names; + } + VLOG(1) << "Output Nodes:"; std::vector out_types; std::vector out_edges; + for (auto& i : output_nodes) { auto node_port = tensorflow::str_util::Split(i, ":"); VLOG(1) << " " << i << " in graph " << node_maps.count(i); @@ -2186,18 +2199,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( out_types.push_back(out_node->output_type(0)); } for (auto out_edge : out_node->out_edges()) { + if (subgraph_ids.count(out_edge->dst()->id())) + continue; // skip internal edges; if (out_edge->src_output() == port) { out_edges.push_back(out_edge); - break; + VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" + << out_edge->src_output() << " -> " << out_edge->dst()->name() + << 
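On the traversal change above: for an acyclic graph, reversing a depth-first post-order yields a topological order, so producers are handled before the nodes that consume them, which is presumably why the calibration nodes are now collected this way. A compact sketch on an adjacency-list graph (illustrative, not part of the patch):

```python
def post_order(graph, roots):
  visited, order = set(), []
  def dfs(node):
    if node in visited:
      return
    visited.add(node)
    for succ in graph.get(node, []):
      dfs(succ)
    order.append(node)  # emitted only after all successors
  for root in roots:
    dfs(root)
  return order

g = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
print(list(reversed(post_order(g, ['a']))))  # ['a', 'c', 'b', 'd']
```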
":" << out_edge->dst_input(); } } } else { LOG(WARNING) << " couldn't find output node " << out_node_name; } } - VLOG(1) << "Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); + if (VLOG_IS_ON(1)) { + VLOG(1) << c_node->name() << " Input Nodes:"; + for (auto& i : input_names) { + VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); + } } auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); auto resmgr = trt_rm->getManager("TRTCalibOps"); @@ -2231,14 +2250,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( calib_res->builder_ = nullptr; tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector income_edges; + income_edges.resize(c_node->num_inputs()); for (const auto in_edge : c_node->in_edges()) { auto src = in_edge->src(); int dest_port = in_edge->dst_input(); - income_edges.emplace_back(src->name(), in_edge->src_output(), - c_node->input_type(dest_port)); + VLOG(1) << "Incoming connection " << src->name() << ":" + << in_edge->src_output() << " -> " << c_node->name() << ":" + << dest_port; + income_edges.at(dest_port) = {src->name(), in_edge->src_output(), + c_node->input_type(dest_port)}; } tensorflow::gtl::ArraySlice input_list( income_edges); + if (VLOG_IS_ON(2)) { + for (const auto& inp : input_list) { + VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " + << tensorflow::DataTypeString(inp.data_type); + } + } op_builder.Input(input_list); tensorflow::NodeDef engine_node; const char* engine_plan_data = static_cast(engine_plan->data()); @@ -2255,13 +2284,26 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( } auto trt_engine_node = graph.AddNode(engine_node, &status); TF_RETURN_IF_ERROR(status); - for (size_t i = 0; i < out_edges.size(); i++) { - VLOG(1) << "Connecting trt_engine_node output " << i << " with " - << out_edges.at(i)->dst()->name() << " port " - << out_edges.at(i)->dst_input(); - TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i, - out_edges.at(i)->dst(), - out_edges.at(i)->dst_input())); + std::map port_map; + for (size_t t = 0; t < output_nodes.size(); t++) { + port_map.insert({output_nodes.at(t), t}); + } + for (auto& i : out_edges) { + string s(i->src()->name()); + if (i->src_output()) StrAppend(&s, ":", i->src_output()); + int out_port = port_map.at(s); + VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port + << " -> " << i->dst()->name() << ":" << i->dst_input(); + TF_RETURN_IF_ERROR( + graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); + } + for (const auto ed : trt_engine_node->in_edges()) { + VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); + } + for (const auto ed : trt_engine_node->out_edges()) { + VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); } VLOG(1) << "Segment nodes:"; for (auto& i : segment_nodes) { @@ -2332,6 +2374,7 @@ tensorflow::Status ConvertSubgraph( std::vector* output_names, std::vector* output_dtypes, const string& engine_name) { + std::set added_tensors; for (const std::pair& input : s.input_inds) { VLOG(2) << "parsing input. 
Node id= " << input.first; int node_id = input.first; @@ -2374,7 +2417,6 @@ tensorflow::Status ConvertSubgraph( auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); - input_dtypes->push_back(tf_dtype); nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); auto type_status = ConvertDType(tf_dtype, &dtype); @@ -2410,8 +2452,10 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) { input_tensor_name = StrCat(node_name, ":", output_idx); } - + if (added_tensors.count(input_tensor_name)) continue; + added_tensors.insert(input_tensor_name); input_names->push_back(input_tensor_name); + input_dtypes->push_back(tf_dtype); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); @@ -2435,6 +2479,7 @@ tensorflow::Status ConvertSubgraph( // Gather output metadata int trt_engine_op_output_idx = 0; + added_tensors.clear(); for (const std::pair& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; @@ -2451,6 +2496,8 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); VLOG(2) << "Output tensor name: " << tensor_name; + if (added_tensors.count(tensor_name)) continue; + added_tensors.insert(tensor_name); output_names->push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); if (!tensor_or_weights.is_tensor()) { diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index 2e472a2805..d879170b68 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -166,11 +166,21 @@ def StreamingFilesDataset(files, return remote_iterator.get_next() def MapFn(unused_input): - return functional_ops.remote_call( + if isinstance(source_dataset.output_types, dtypes.DType): + output_types = [source_dataset.output_types] + elif isinstance(source_dataset.output_types, (list, tuple)): + output_types = source_dataset.output_types + else: + raise ValueError('source dataset has invalid output types') + remote_calls = functional_ops.remote_call( args=[source_handle], - Tout=[dtypes.string], + Tout=output_types, f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) + if len(remote_calls) == 1: + return remote_calls[0] + else: + return remote_calls with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index 918cf0ed8e..b58d05eac5 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -26,6 +26,8 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import python_io from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -162,6 +164,30 @@ class DatasetsTest(test.TestCase): self.assertEqual(set(all_contents), set(retrieved_values)) + def testArbitraryReaderFuncFromDatasetGenerator(self): + + def my_generator(): + yield (1, [1] * 10) + + def 
gen_dataset(dummy): + return dataset_ops.Dataset.from_generator( + my_generator, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10]))) + + dataset = datasets.StreamingFilesDataset( + dataset_ops.Dataset.range(10), filetype=gen_dataset) + + iterator = dataset.make_initializable_iterator() + self._sess.run(iterator.initializer) + get_next = iterator.get_next() + + retrieved_values = self._sess.run(get_next) + + self.assertIsInstance(retrieved_values, (list, tuple)) + self.assertEqual(len(retrieved_values), 2) + self.assertEqual(retrieved_values[0], 1) + self.assertItemsEqual(retrieved_values[1], [1] * 10) + def testUnexpectedFiletypeString(self): with self.assertRaises(ValueError): datasets.StreamingFilesDataset( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index d89633199d..b1c224a345 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -699,7 +699,9 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ + ":abi", ":lib_platform", + ":stacktrace", ], ) @@ -3089,6 +3091,8 @@ cc_library( # we now need at least "str_util". ":lib", ":lib_platform", + ":stacktrace_handler", + ":test_lite", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, @@ -3569,7 +3573,10 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], + srcs = [ + "common_runtime/mkl_cpu_allocator_test.cc", + "common_runtime/mkl_threadpool_device_test.cc", + ], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt index cbe76de415..985f09312f 100644 --- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt @@ -4,6 +4,10 @@ op { description: < 0`, limit of the split of the result. +END + } + summary: "Split elements of `source` based on `sep` into a `SparseTensor`." + description: <2<><>3"` and +sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty +string, consecutive whitespace are regarded as a single separator, and the +result will contain no empty strings at the startor end if the string has +leading or trailing whitespace. + +Note that the above mentioned behavior matches python's str.split. +END +} diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt new file mode 100644 index 0000000000..0e8576fb01 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "StringSplitV2" + visibility: HIDDEN +} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 8f2a419756..9cda17867b 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCAllocator::Extend(size_t rounded_bytes) { +bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. 
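The StringSplitV2 documentation above states that the semantics follow Python's `str.split`, which makes them easy to sanity-check directly (illustrative only, not part of the patch):

```python
# sep given: consecutive delimiters delimit empty strings.
print("1<>2<><>3".split("<>"))      # ['1', '2', '', '3']
# sep omitted: runs of whitespace collapse, no leading/trailing empties.
print("  hello   world ".split())   # ['hello', 'world']
```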
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = suballocator_->Alloc(32, bytes); + void* mem_addr = suballocator_->Alloc(alignment, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = suballocator_->Alloc(32, bytes); + mem_addr = suballocator_->Alloc(alignment, bytes); } } @@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } // Try to extend - if (Extend(rounded_bytes)) { + if (Extend(unused_alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index ba5a3eea3a..52aedb1e9c 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -305,7 +305,8 @@ class BFCAllocator : public VisitableAllocator { // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. - bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); + bool Extend(size_t alignment, size_t rounded_bytes) + EXCLUSIVE_LOCKS_REQUIRED(lock_); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index c21a1ea9f2..9028e6298c 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -102,9 +102,25 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { +#ifdef INTEL_MKL + // if MKL is used, it goes through various additional + // graph rewrite pass. In TF, everytime a graph pass + // happens, "constant" nodes are allocated + // and deallocated. Each allocation calls the + // (FindChunkPtr of BFCAllocator), + // which increments the value of AllocationId. + // Thus AllocationId becomes more than 3 and 4 if + // MKL is used. Now they are 9 and 10 for MKL. + EXPECT_EQ(19, cm->AllocationId(node, 0)); +#else EXPECT_EQ(21, cm->AllocationId(node, 0)); +#endif } else { +#ifdef INTEL_MKL + EXPECT_EQ(20, cm->AllocationId(node, 0)); +#else EXPECT_EQ(22, cm->AllocationId(node, 0)); +#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc new file mode 100644 index 0000000000..5d583a8360 --- /dev/null +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +#ifdef _OPENMP +TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { + SessionOptions options; + unsetenv("OMP_NUM_THREADS"); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + const int ht = port::NumHyperthreadsPerCore(); + EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); +} + +TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { + SessionOptions options; + setenv("OMP_NUM_THREADS", "314", 1); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + EXPECT_EQ(omp_get_max_threads(), 314); +} +#endif // _OPENMP + +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 21912236d0..a5d31b75c7 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,8 +16,10 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL +#ifdef _OPENMP #include -#endif +#endif // _OPENMP +#endif // INTEL_MKL #include #include "tensorflow/core/lib/core/threadpool.h" @@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - const int mkl_intra_op = omp_get_max_threads(); + int mkl_intra_op = 1; +#ifdef _OPENMP + mkl_intra_op = omp_get_max_threads(); +#endif // _OPENMP CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif +#endif // INTEL_MKL } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f7a07fe503..74a87215e1 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,7 +31,11 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL +#ifdef _OPENMP +#include +#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { +#ifdef INTEL_MKL +#ifdef _OPENMP + const char* user_omp_threads = getenv("OMP_NUM_THREADS"); + if (user_omp_threads == nullptr) { + // OMP_NUM_THREADS controls MKL's intra-op parallelization + // Default to available physical cores + const int mkl_intra_op = port::NumSchedulableCPUs(); + const int ht = port::NumHyperthreadsPerCore(); + omp_set_num_threads((mkl_intra_op + ht - 1) / ht); + } else { + uint64 user_val = 0; + if (strings::safe_strtou64(user_omp_threads, &user_val)) { + // Superflous but triggers OpenMP loading + omp_set_num_threads(user_val); + } + } +#endif // _OPENMP +#endif // INTEL_MKL +} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc index 1cea1b1462..770a0fcf14 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc @@ -147,7 +147,9 @@ MasterService::Stub::Stub( } MasterService::AsyncService::AsyncService() { - for (int i = 0; i < 10; ++i) { + int method_len = sizeof(grpcMasterService_method_names) / + sizeof(grpcMasterService_method_names[0]); + for (int i = 0; i < method_len; ++i) { AddMethod(new ::grpc::internal::RpcServiceMethod( grpcMasterService_method_names[i], ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc index 89f83f9f24..a8508d2d4f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -50,9 +51,14 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n, } for (int i = 0; i < n; ++i) { + string server_file = + strings::StrCat(testing::TensorFlowSrcRoot(), + "/core/distributed_runtime/rpc/grpc_testlib_server"); + if (!options.env->FileExists(server_file).ok()) { + return errors::Internal("Could not find grpc_testlib_server"); + } const std::vector argv( - {strings::StrCat(testing::TensorFlowSrcRoot(), - "/core/distributed_runtime/rpc/grpc_testlib_server"), + {server_file, /* see grpc_testlib_server.cc for flags */ tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i), strings::StrCat("--num_cpus=", num_cpus), diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2c87156dca..2bb4d32d57 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,13 +67,8 @@ struct AllocatorStats { // device memory. 
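The OpenMP defaults introduced above are plain ceiling division: when `OMP_NUM_THREADS` is unset, intra-op threads default to roughly the physical core count, and `process_util.cc` derives the inter-op pool size from it. A sketch of the arithmetic with stand-in values (not actual `port::` queries; not part of the patch):

```python
def ceil_div(a, b):
  return (a + b - 1) // b

logical_cpus = 16        # stand-in for port::NumSchedulableCPUs()
ht_per_core = 2          # stand-in for port::NumHyperthreadsPerCore()

intra_op = ceil_div(logical_cpus, ht_per_core)        # OMP default: 8
inter_op = max(ceil_div(logical_cpus, intra_op), 2)   # at least 2:  2
print(intra_op, inter_op)
```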
class Allocator { public: -#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; -#else - // Align to 32 byte boundary. - static constexpr size_t kAllocatorAlignment = 32; -#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 3d7920a6e2..4b56d807df 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" +#include #include #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index eb689ec1e6..10072724d2 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -5,7 +5,7 @@ option cc_enable_arenas = true; option java_outer_classname = "RemoteFusedGraphExecuteInfoProto"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; -//add go_package externally +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/types.proto"; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index b613effd18..80e168df97 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte +// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure -// its result is aligned if the caller intends to use those methods. -// In this test case, we simply make sure each slice is 32-byte -// aligned: sizeof(float) * 4 * 2 = 32. +// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires +// the caller to ensure its result is aligned if the caller intends +// to use those methods. In this test case, we simply make sure each +// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat().setConstant(i * 1.f); } // A simple slice along dim0. 
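The arithmetic in the updated test comment is worth spelling out: with the allocator now always 64-byte aligned, a dim-0 slice of a float tensor stays aligned only if its byte stride is a multiple of 64, which is why the test shape moves from 34 to 36 in the last dimension. A small check (not part of the patch):

```python
sizeof_float = 4
print(sizeof_float * 4 * 36)         # 576, and 576 % 64 == 0 -> slices stay aligned
print((sizeof_float * 4 * 36) % 64)  # 0
print((sizeof_float * 4 * 34) % 64)  # 32 -> the old shape only guaranteed 32 bytes
```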
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); auto tx = x.tensor(); auto ty = y.tensor(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); x.flat().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 72a13d4da7..b9667998d6 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized + // path. The unoptimized path is slow. Thus we dont rewrite the node + // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKl DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN " ; + << "for LRN "; return false; } @@ -3015,6 +3015,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector* ws_tensors, bool* are_ws_tensors_added); + // Helper function used by FixMklMetaDataEdges. 
Fixes the metadata edge + // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph + // 'g'. Returns true is fixup was done; otherwise, it returns false. + bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, + const Edge* e_data, const Edge* e_metadata); + + // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly + // connected? If not, then fix them. This is needed because a graph may have + // some input Mkl metadata edges incorrectly setup after node merge and + // rewrite passes. This could happen because GetReversePostOrder function may + // not provide topologically sorted order if a graph contains cycles. The + // function returns true if at least one Mkl metadata edge for node 'n' was + // fixed. Otherwise, it returns false. + // + // Example: + // + // X = MklConv2D(_, _, _) + // Y = MklConv2DWithBias(_, _, _, _, _, _) + // Z = MklAdd(X, Y, DummyMklTensor, Y:1) + // + // For a graph such as shown above, note that 3rd argument of MklAdd contains + // DummyMklTensor. Actually, it should be getting the Mkl metadata from + // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible + // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X + // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl + // metadata edges only - it does not rewrite nodes nor does it modify the Mkl + // data edges (1st and 2nd arguments of MklAdd). + bool FixMklMetaDataEdges(std::unique_ptr* g, Node* n); + // Functions specific to operators to copy attributes // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. @@ -4241,6 +4270,92 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } +/////////////////////////////////////////////////////////////////////////////// +// Post-rewrite Mkl metadata fixup pass +/////////////////////////////////////////////////////////////////////////////// +bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, + const Edge* e_data, const Edge* e_metadata) { + if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { + return false; + } + + Node* n_data = e_data->src(); + int n_data_op_slot = e_data->src_output(); + int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, + n_data->num_outputs()); + + // If the source of meta edge is a constant node (producing dummy Mkl metadata + // tensor), then we will need to fix. + if (IsConstant(e_metadata->src())) { + Node* e_metadata_dst = e_metadata->dst(); + int e_metadata_in_slot = e_metadata->dst_input(); + CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, + e_metadata_dst, e_metadata_in_slot)); + + (*g)->RemoveEdge(e_metadata); + return true; + } + + return false; +} + +bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr* g, + Node* n) { + bool result = false; + + // If graph node is not Mkl node, then return. + DataType T = DT_INVALID; + if (!GetNodeAttr(n->def(), "T", &T).ok() || + !mkl_op_registry::IsMklOp(n->type_string(), T)) { + return result; + } + + // If it is Mkl node, then check if the input edges to this node that carry + // Mkl metadata are linked up correctly with the source node. + + // For Mkl nodes, we generate twice the number of input tensors (n for Mkl + // data tensors + n for Mkl metadata tensors). We need to check for correct + // connection of n metadata tensors only. 
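The check below leans on the convention that an Mkl op carries one metadata input per data input, so a node with 2n inputs pairs data slot i with a metadata slot computed by GetTensorMetaDataIndex. As an illustration only, since the real mapping depends on the tensor-ordering convention configured in mkl_util.h, the two layouts commonly considered look like this:

    // Hypothetical helpers, not the pass's own code: index of the metadata
    // tensor paired with data slot `i` for a node with `num_inputs` inputs.
    int MetaSlotContiguous(int i, int num_inputs) {  // D0 D1 ... Dn-1 M0 M1 ... Mn-1
      return i + num_inputs / 2;
    }
    int MetaSlotInterleaved(int i) {                 // D0 M0 D1 M1 ...
      return 2 * i + 1;
    }

Whichever convention is in effect, the fixup replaces a dummy-constant metadata input with an edge from the data producer's matching metadata output, as the MklAdd example in the comment above describes.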
+ int num_data_inputs = n->num_inputs() / 2; + for (int idx = 0; idx < num_data_inputs; idx++) { + // Get the edge connecting input slot with index (idx). + const Edge* e = nullptr; + TF_CHECK_OK(n->input_edge(idx, &e)); + + // If e is control edge, then skip. + if (e->IsControlEdge()) { + continue; + } + + // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl + // node, then we don't need to do anything. + Node* e_src = e->src(); + if (GetNodeAttr(e_src->def(), "T", &T).ok() && + mkl_op_registry::IsMklOp(e_src->type_string(), T)) { + // Source node for edge 'e' is Mkl node. + // Destination node and destination input slot of e is node 'n' and 'idx' + // resp. + CHECK_EQ(e->dst(), n); + CHECK_EQ(e->dst_input(), idx); + + // Let's get edge that carries Mkl metadata corresponding to Mkl data edge + // 'e'. For that, let's first get the input slot of 'n' where the meta + // edge will feed the value. + int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(), + n->num_inputs()); + const Edge* e_meta = nullptr; + TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta)); + + // Let's check if we need to fix this meta edge. + if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) { + result = true; + } + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// // Run function for the pass /////////////////////////////////////////////////////////////////////////////// @@ -4307,6 +4422,25 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr* g) { DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g); + order.clear(); + GetReversePostOrder(**g, &order); // This will give us topological sort. + for (Node* n : order) { + // If node is not an op or it cannot run on CPU device, then skip. 
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { + continue; + } + if (FixMklMetaDataEdges(g, n)) { + string node_name = n->name(); + string op_name = n->type_string(); + + VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " + << node_name << " with op " << op_name; + result = true; + } + } + DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", + &**g); + return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 029cdcf94a..7645b4a7f0 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,6 +3518,37 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } +///////////////////////////////////////////////////////////////////// +// Post-rewrite fixup pass test + +TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'M' op: '_MklInput'}" + "node { name: 'N' op: '_MklInput'}" + "node { name: 'C' op: '_MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['A', 'B', 'M', 'N']}" + "node { name: 'D' op: 'Const' " + " attr { key: 'dtype' value { type: DT_UINT8 } }" + " attr { key: 'value' value { " + " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " + " int_val: 0 } } } }" + "node { name: 'E' op: '_MklAdd'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'A', 'D', 'D']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" + "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" + "D->E:3;M->C:2;N->C:3"); +} + ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 6749a7c571..0c02876ac5 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,7 +610,6 @@ class SymbolicShapeRefiner { } }; - // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. 
ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 1b18087cdf..8ca726df0b 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,6 +679,7 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -780,7 +781,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 4dde7ed1b4..03e36a7b9c 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -200,8 +201,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() - << std::endl; + VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index 66c4aff3e3..a7757d1361 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,6 +73,7 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: + case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -129,6 +130,7 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); + ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 14d889e8e3..49b90e855b 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,52 +33,41 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); + OP_REQUIRES(ctx, (in0.shape() == in1.shape() || + TensorShapeUtils::IsScalar(in1.shape())) && + (in0.shape() == in2.shape() || + TensorShapeUtils::IsScalar(in2.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. 
", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); + + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); + auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, - (in0.shape() == in2.shape() && - TensorShapeUtils::IsScalar(in1.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 9a3b2303a3..17a85d9773 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,6 +57,7 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index e6fefe643b..5cd8e04927 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,6 +37,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 39b6924d74..4563fc6353 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 7e5a9e1ec5..4e53291b7f 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,6 +228,8 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -239,6 +241,8 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) +TF_CALL_int32(REGISTER_GATHER_ND_GPU); +TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index b03efc684f..da8d2e9e3c 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,6 +119,8 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index ef332ebee3..094504d6b9 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,6 +153,7 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) +TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 5eeb23d810..31d1b949ef 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,6 +14,7 @@ limitations under the License. #include #include +#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -590,8 +591,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector input_shapes(N); - GetMklShapeList(context, "values", &input_shapes); + std::vector mkl_input_shapes(N); + GetMklShapeList(context, "values", &mkl_input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -610,19 +611,14 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() - ? input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() + ? mkl_input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : input_shapes) { - if (s == expected_shape) { - ++i; - continue; - } - + for (auto& s : mkl_input_shapes) { TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -665,21 +661,14 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - TensorShapeList tf_input_shapes; - i = 0; - for (auto& s : input_shapes) { - TensorShape s_shape = - s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); - tf_input_shapes.push_back(s_shape); - ++i; - } - CallEigenVersion(context, input_tensors, tf_input_shapes); + CallEigenVersion(context, input_tensors, mkl_input_shapes); return; } memory::dims dst_dims; + if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -689,26 +678,61 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k = 0; k < N; k++) { - bool is_mkl_tensor = input_shapes[k].IsMklTensor(); - memory::dims src_dims; - - // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) - ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) - : TFShapeToMklDnnDims(input_tensors[k].shape()); - - dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = - is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here - // (NHWC or NCHW). We just need to ensure that output - // of Concat uses same data format as input. - memory::desc(src_dims, MklDnnType(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); + + bool isMklReorderNeeded = false; + memory::format mkl_common_format = memory::format::any; + if (are_all_mkl_inputs) { + mkl_common_format = + FindMklCommonFormat(mkl_input_shapes, concat_dim, + &isMklReorderNeeded, &dst_concat_dim_size); + + if (!isMklReorderNeeded) { + // All MKL tensors have a same format. Reorder is not needed. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } + } else { + // MKL tensors have different formats. + // Reorder them to most common format. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_dims = TFShapeToMklDnnDims( + mkl_input_shapes[k].GetTfShape()); + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + + if (src_md.data.format != mkl_common_format) + src_md = memory::desc(src_dims, MklDnnType(), + mkl_common_format); + + srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); + } + } + } else { // All TF inputs + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); + dst_concat_dim_size += src_dims[concat_dim]; + + // It does not matter what data format to be used (NHWC versus NCHW). + // We just need to ensure that output uses same data format as inputs. + auto src_md = + memory::desc(src_dims, MklDnnType(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } } dst_dims[concat_dim] = dst_concat_dim_size; @@ -718,25 +742,33 @@ class MklConcatOp : public OpKernel { if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). 
- auto orig_tf_format = input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // We will set the output in the same format as input to avoid layout - // conversions. - // Currently we are setting dst format same as input format. - // See if we can make this choice in a better way. + // Set the output format same as the most common format of inputs + // to avoid layout conversions. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType(), - (memory::format)input_shapes[0].GetMklLayout().data.format); + dst_dims_in_nchw, MklDnnType(), mkl_common_format); } else { - // Again, format does not matter here. We just need to make it same as - // input format. + // All inputs are TF tensors. + // Set the output format same as input format (nchw). dst_md = memory::desc(dst_dims, MklDnnType(), memory::format::nchw); } std::vector inputs; - for (int k = 0; k < input_tensors.size(); k++) - inputs.push_back(srcs[k].GetOpMem()); + std::vector net; + if (isMklReorderNeeded) { + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); + } + } + } + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + inputs.push_back(srcs[k].GetOpMem()); + } + } // If all inputs are in MKL format, then meaning of concat_dim needs to // change. Value of concat_dim is tied to input Tensorflow data format @@ -745,7 +777,8 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
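To make the concat_dim remapping described in the comment above concrete: TensorFlow axes of an NHWC tensor map onto MklDnn's canonical NCHW ordering, so concatenating over channels (TF axis 3) becomes axis 1 on the MklDnn side. The pass uses MklDnnShape::TfDimIdx for this; the lookup table below is only an illustration of the NHWC case:

    #include <array>
    #include <cassert>

    // NHWC axis -> position of that dimension in NCHW ordering.
    int NhwcAxisToNchw(int tf_axis) {
      constexpr std::array<int, 4> kMap = {0 /*N*/, 2 /*H*/, 3 /*W*/, 1 /*C*/};
      return kMap.at(tf_axis);
    }

    int main() {
      assert(NhwcAxisToNchw(3) == 1);  // concat over channels
      assert(NhwcAxisToNchw(1) == 2);  // concat over height
      return 0;
    }
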
- if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) + concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -758,7 +791,7 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - input_shapes[0].GetTfDataFormat()); + mkl_input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -773,7 +806,6 @@ class MklConcatOp : public OpKernel { dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); - std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -787,15 +819,27 @@ class MklConcatOp : public OpKernel { } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const TensorShapeList& input_shapes) { - CHECK_EQ(values.size(), input_shapes.size()); + const MklDnnShapeList& mkl_input_shapes) { + CHECK_EQ(values.size(), mkl_input_shapes.size()); std::vector converted_values; - for (int i = 0; i < input_shapes.size(); i++) - converted_values.push_back(values[i]); + TensorShapeList tf_input_shapes; + for (int i = 0; i < mkl_input_shapes.size(); i++) { + if (mkl_input_shapes[i].IsMklTensor()) { + // do conversion from MKL to TF + Tensor tmp_tensor = + ConvertMklToTF(context, values[i], mkl_input_shapes[i]); + converted_values.push_back(tmp_tensor); + tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); + } else { + // no conversion since it is TF tensor already + converted_values.push_back(values[i]); + tf_input_shapes.push_back(values[i].shape()); + } + } // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, input_shapes); + eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -812,6 +856,55 @@ class MklConcatOp : public OpKernel { output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); } + + // This method finds the most commom format accross all MKL inputs + // Inputs: + // 1. input_shapes: shapes of input (MKL) tensors. + // 2. concat_dim: concat dimension. + // Outputs: + // 1. is_reorder_needed is set to true if inputs have difference formats + // It is set to false otherwise. + // 2. concat_dim_size is the size of concat_dim. + // Return: + // return the common MKL format. + memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, + int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { + *is_reorder_needed = false; + *concat_dim_size = 0; + std::unordered_map occurrence_map; + if (input_shapes.size() == 0) + return memory::format::any; + + // Compute ocurrences of each format of all inputs. + for (int k=0; k ( + input_shapes[k].GetMklLayout().data.format); + occurrence_map[fmt] += 1; + } + + if (occurrence_map.size() == 1) { + // this means that all inputs have a same format + // return it with is_reorder_needed set false. + return static_cast( + input_shapes[0].GetMklLayout().data.format); + } + + // Input tensors have different formats. Thus, reorder is needed. + // We pick up the most common format to minimize the total + // number of input reorder. 
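The selection loop that completes this helper just below is a plain majority vote: pick whichever memory format occurs most often so that the fewest inputs need a reorder. A minimal standalone sketch of the same idea, using ints in place of memory::format values (the function name is invented for illustration):

    #include <unordered_map>
    #include <vector>

    // Returns the most frequent value in `formats` (ties broken arbitrarily)
    // and reports whether more than one distinct format was seen.
    int MostCommonFormat(const std::vector<int>& formats, bool* needs_reorder) {
      std::unordered_map<int, int> counts;
      for (int f : formats) ++counts[f];
      *needs_reorder = counts.size() > 1;
      int best = -1, best_count = 0;
      for (const auto& kv : counts) {
        if (kv.second > best_count) {
          best = kv.first;
          best_count = kv.second;
        }
      }
      return best;
    }
    // e.g. formats {8, 8, 1} -> returns 8 with needs_reorder = true,
    // so only the input stored in format 1 has to be reordered.
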
+ memory::format commonest_format = memory::format::any; + int max_occurrence = 0; + *is_reorder_needed = true; + for (auto item : occurrence_map) { + if (item.second > max_occurrence) { + commonest_format = static_cast(item.first); + max_occurrence = item.second; + } + } + return commonest_format; + } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index c1da0ded1d..f857be6c32 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,6 +18,7 @@ limitations under the License. // bias. #ifdef INTEL_MKL +#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -264,4 +265,5 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ +#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 279167aba2..c0dfed7d7d 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,13 +199,15 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = + if (input_tensor.NumElements() != 0) { + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); + dnn_data_input->SetUsrMem(input_md, &input_tensor); + } this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 43c5b29509..e1fc2ea128 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,6 +292,7 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); +TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -306,6 +307,8 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -576,6 +579,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd. 
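Several hunks in this patch widen type coverage simply by adding TF_CALL_int32 / TF_CALL_int64 lines in front of existing instantiation or registration macros. For readers unfamiliar with that style, here is a generic sketch of the pattern with invented macro names; it is not TensorFlow's actual macro machinery:

    #include <cstdint>
    #include <iostream>

    using int32 = std::int32_t;
    using int64 = std::int64_t;

    // Each CALL_* macro expands its argument once for one type, so adding a
    // single CALL_* line adds one instantiation, much like TF_CALL_int32(...).
    #define CALL_int32(M) M(int32)
    #define CALL_int64(M) M(int64)

    template <typename T>
    struct Kernel {
      static void Run(T v) { std::cout << v << "\n"; }
    };
    #define INSTANTIATE(T) template struct Kernel<T>;

    CALL_int32(INSTANTIATE)
    CALL_int64(INSTANTIATE)

    int main() {
      Kernel<int32>::Run(7);
      Kernel<int64>::Run(9);
      return 0;
    }
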
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index a3c21edc15..08b657f4c3 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,6 +170,7 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index bb0129fa6f..634f9ba887 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,8 +216,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); - ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); + MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); + + // The elements of the third parameter to ExecOp must be multiples of + // Allocator::kAllocatorAlignment in size. If they are not, the backing + // tensor allocated by PrepOp will have too many elements and reshaping + // will fail. + ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index 7796bf3587..d65692a552 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,6 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ + +// This file requires the following include because it uses CudaAtomicMax: +// #include "tensorflow/core/util/cuda_kernel_helper.h" + +// Unfortunately we can't add the #include, since it breaks compilation for +// non-GPU targets. This only breaks in clang, because it's more strict for +// template code and CudaAtomicMax is used in template context. + // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -130,4 +138,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index a1f9667b78..866c5dcd52 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is a an overview of the SparseMatMul code. Note that we assume that the +// Here is an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 4c2b312c34..26ab72f12e 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -43,6 +44,63 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } +std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { + // This SplitV2 method matches the behavior of python's str.split: + // If sep is given, consecutive delimiters are not grouped together + // and are deemed to delimit empty strings (for example, '1,,2'.split(',') + // returns ['1', '', '2']). The sep argument may consist of multiple + // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). + // Splitting an empty string with a specified separator returns ['']. + // + // If sep is not specified or is None, a different splitting algorithm is + // applied: runs of consecutive whitespace are regarded as a single + // separator, and the result will contain no empty strings at the start or + // end if the string has leading or trailing whitespace. Consequently, + // splitting an empty string or a string consisting of just whitespace + // with a None separator returns []. + + std::vector result; + + StringPiece text(str); + if (maxsplit == 0) { + result.emplace_back(std::string(text)); + return result; + } + + if (sep.empty()) { + StringPiece token; + // Remove leading whitespaces. + str_util::RemoveLeadingWhitespace(&text); + int split = 0; + while (str_util::ConsumeNonWhitespace(&text, &token)) { + result.emplace_back(std::string(token)); + str_util::RemoveLeadingWhitespace(&text); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + } + return result; + } + auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + int split = 0; + while (p != text.end()) { + StringPiece token = text.substr(0, p - text.begin()); + result.emplace_back(std::string(token)); + text.remove_prefix(token.size()); + text.remove_prefix(sep.size()); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + } + result.emplace_back(std::string(text)); + return result; +} + } // namespace class StringSplitOp : public OpKernel { @@ -122,6 +180,78 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; +class StringSplitV2Op : public OpKernel { + public: + explicit StringSplitV2Op(OpKernelConstruction* context) + : OpKernel(context), maxsplit_(-1) { + OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), + errors::InvalidArgument("input must be a vector, got shape: ", + input_tensor->shape().DebugString())); + + const auto input_vec = input_tensor->vec(); + const int64 batch_size = input_vec.dimension(0); + + const Tensor* sep_tensor; + OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), + errors::InvalidArgument("sep must be a scalar, got shape: ", + sep_tensor->shape().DebugString())); + const auto sep_vec = sep_tensor->flat(); + StringPiece sep(sep_vec(0)); + std::vector tokens; + // Guess that we'll be unpacking a 
handful of tokens per example. + static constexpr int kReserveSize = 4; + tokens.reserve(batch_size * kReserveSize); + + int64 output_size = 0; + int64 max_num_entries = 0; + std::vector num_indices(batch_size); + for (int64 i = 0; i < batch_size; ++i) { + std::vector parts = SplitV2(input_vec(i), sep, maxsplit_); + int64 n_entries = parts.size(); + num_indices[i] = n_entries; + output_size += n_entries; + max_num_entries = std::max(max_num_entries, n_entries); + tokens.insert(tokens.end(), parts.begin(), parts.end()); + } + + Tensor* sp_indices_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), + &sp_indices_t)); + Tensor* sp_tokens_t; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); + Tensor* sp_shape_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); + + auto sp_indices = sp_indices_t->matrix(); + auto sp_tokens = sp_tokens_t->vec(); + auto sp_shape = sp_shape_t->vec(); + sp_shape(0) = batch_size; + sp_shape(1) = max_num_entries; + size_t c = 0; + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_indices[i]; ++j) { + sp_indices(c, 0) = i; + sp_indices(c, 1) = j; + sp_tokens(c) = tokens[c]; + ++c; + } + } + } + + private: + int maxsplit_; +}; + REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); +REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), + StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e4d100b04..6e589c8d1c 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,12 +145,15 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes. + // Validate true_classes, must be a matrix. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); + // Validate sampled_candidates, must be a vector. + ShapeHandle sampled_candidates; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 15e0ca8af9..9dca5f53ce 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,7 +218,17 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use index from the end to retrieve the Input shapes, + // so that to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars. 
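To make the SplitV2 semantics added in string_split_op.cc above concrete, a few expected results, all following from the python-style str.split behavior its comment describes (illustration only, not additional test vectors from the patch):

    SplitV2("1,,2",     ",",  -1)  ->  ["1", "", "2"]   // explicit sep keeps empty strings
    SplitV2("1<>2<>3",  "<>", -1)  ->  ["1", "2", "3"]  // multi-character separator
    SplitV2("  a  b  ", "",   -1)  ->  ["a", "b"]       // empty sep: whitespace runs collapse,
                                                        //  no leading/trailing empties
    SplitV2("a,b,c",    ",",   1)  ->  ["a", "b,c"]     // maxsplit caps the number of splits
    SplitV2("",         ",",  -1)  ->  [""]             // empty input, explicit sep
    SplitV2("",         "",   -1)  ->  []               // empty input, empty sep

The StringSplitV2 kernel then packs these ragged rows into the sparse triple of indices, values, and a dense shape of [batch_size, max_tokens_in_any_row].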
+ shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -231,7 +241,17 @@ REGISTER_OP("MapAndBatchDatasetV2") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use index from the end to retrieve the Input shapes, + // so that to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. + shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index d949e70c66..87f4991134 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,7 +454,9 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); + // The rank of the input image (rank = 4) has already been restricted + // above, and the output is of the same shape as the input. 
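The shape functions added for MapAndBatchDataset and MapAndBatchDatasetV2 above index their inputs from the end because the number of other_arguments inputs is not fixed. With a hypothetical Targuments list of length 2 (the count is made up for illustration), the input layout looks like:

    // 0: input_dataset
    // 1: other_arguments[0]
    // 2: other_arguments[1]
    // 3: batch_size            == c->input(c->num_inputs() - 3)
    // 4: num_parallel_batches  == c->input(c->num_inputs() - 2)   (num_parallel_calls in V2)
    // 5: drop_remainder        == c->input(c->num_inputs() - 1)

However many other_arguments the op carries, the three trailing scalars always sit at num_inputs() - 3 through num_inputs() - 1, so WithRank(..., 0, ...) can validate them without knowing the length of Targuments.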
+ return shape_inference::UnchangedShape(c); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 1740fa152c..b3487122e2 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: realnumbertype") + .Attr("T: numbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fc60e807b9..41efa49ce3 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,6 +1453,7 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 1d5c743a56..4423062362 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int32, int64, complex64, float, double, bool, int8}") + .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,6 +134,24 @@ REGISTER_OP("StringSplit") return Status::OK(); }); +REGISTER_OP("StringSplitV2") + .Input("input: string") + .Input("sep: string") + .Output("indices: int64") + .Output("values: string") + .Output("shape: int64") + .Attr("maxsplit: int = -1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + + c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); + c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); + c->set_output(2, c->Vector(2)); + return Status::OK(); + }); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index 99de364042..e9da3d8e32 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,5 +344,28 @@ int CPUModelNum() { #endif } +int CPUIDNumSMT() { +#ifdef PLATFORM_IS_X86 + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) + // Section: Detecting Hardware Multi-threads Support and Topology + // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures + // Other cases not supported + uint32 eax, ebx, ecx, edx; + // Check if system supports Leaf 11 + GETCPUID(eax, ebx, ecx, edx, 0, 0); + if (eax >= 11) { + // 1) Leaf 11 available? 
CPUID.(EAX=11, ECX=0):EBX != 0 + // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, + // ECX=0):ECX[15:8] is 1 + GETCPUID(eax, ebx, ecx, edx, 11, 0); + if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { + return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width + } + } +#endif // PLATFORM_IS_X86 + return 0; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index b5be7e8b54..175c9ae8b1 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,6 +35,10 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); +// Returns an estimate of the number of hyperthreads per physical core +// on the CPU +int NumHyperthreadsPerCore(); + // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -107,6 +111,9 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); +// Returns num of hyperthreads per physical core +int CPUIDNumSMT(); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index ae81f9b5b3..a319ccbdbe 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,6 +71,8 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], + # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 + # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 72c12318ca..ff4b4436bb 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,18 +115,17 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home == nullptr) { - status_ = errors::FailedPrecondition( - "Environment variable HADOOP_HDFS_HOME not set"); - return; - } - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (!status_.ok()) { - // try load libhdfs.so using dynamic loader's search path in case - // libhdfs.so is installed in non-standard location - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); + if (hdfs_home != nullptr) { + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (status_.ok()) { + return; + } } + + // Try to load the library dynamically in case it has been installed + // to a in non-standard location. + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 8e316472fe..708f32ba80 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,6 +74,11 @@ int NumSchedulableCPUs() { return kDefaultCores; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ? 
ht_per_core : 1; +} + void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 522a9d84fd..cb1fd09dbb 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 8 +#define TF_MINOR_VERSION 9 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-rc0" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index dffc965b14..90b6533690 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,6 +42,7 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" +#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -712,15 +713,48 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else +using mkldnn::stream; +template class MklDnnData; + template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - TensorShape output_shape; - - TF_CHECK_OK( - Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); - + try { + if (!mkl_shape.IsMklTensor()) + return mkl_tensor; // return input since it is already TF tensor + + TensorShape output_shape = mkl_shape.GetTfShape();; + + // Allocate output tensor. + context->allocate_temp(DataTypeToEnum::v(), + output_shape, &output_tensor); + + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData input(&cpu_engine); + + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape.GetMklLayout(); + auto output_tf_md = mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); + input.SetUsrMem(input_mkl_md, &mkl_tensor); + + // reorder + if (input.IsReorderNeeded(output_tf_pd)) { + std::vector net; + CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + } else { + // If not, just forward input tensor to output tensor. 
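Back in cpu_info.cc, CPUIDNumSMT derives the hyperthread count from CPUID leaf 11 by pulling two bit fields out of the returned registers. A small worked sketch of that field extraction with made-up register values (only the bit positions come from the code above; the sample values are hypothetical):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Pretend CPUID(EAX=11, ECX=0) returned these values on some machine:
      std::uint32_t eax = 0x00000001;  // EAX[4:0]  = SMT mask width
      std::uint32_t ecx = 0x00000100;  // ECX[15:8] = level type, 1 == SMT

      std::uint32_t smt_mask_width = eax & 0x1f;       // -> 1
      std::uint32_t level_type = (ecx & 0xff00) >> 8;  // -> 1, i.e. an SMT level
      if (level_type == 1) {
        std::cout << (1u << smt_mask_width) << " hyperthreads per core\n";  // 2^1 = 2
      }
      return 0;
    }
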
+ CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); + } + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + LOG(FATAL) << "Operation received an exception: " << error_msg; + } return output_tensor; } #endif @@ -1843,7 +1877,7 @@ class FactoryKeyCreator { template void AddAsKey(const T data) { auto buffer = reinterpret_cast(&data); - Append(absl::string_view(buffer, sizeof(T))); + Append(StringPiece(buffer, sizeof(T))); } std::string GetKey() { @@ -1854,8 +1888,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(absl::string_view s) { - key_.append(string(s)); + void Append(StringPiece s) { + key_.append(s.ToString()); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index d92f5775fa..0b07d413da 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,17 +1,38 @@ # User Groups -TensorFlow has communities around the world. +TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) ## Asia -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow China community](https://www.tensorflowers.cn) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) +* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) +* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) +* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) +* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) +* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) +* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) +* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) + +## America + +* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) + + +## Oceania +* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) + + +## Africa + +* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index f08ac74425..bbb25e20c6 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) diff --git 
a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 55579d52fb..232d2f1547 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. -The easiest way to get started with TensorFlow is using Eager Execution. +The easiest way to get started with TensorFlow is by using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. See the diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 1abd840ab3..2901848745 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 52a2a3f8a6..55bc0f64e7 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 1256fb99c4..637231da12 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.8.0 + 1.9.0-rc0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.8.0 + 1.9.0-rc0 @@ -124,12 +124,12 @@ instead: org.tensorflow libtensorflow - 1.8.0 + 1.9.0-rc0 org.tensorflow libtensorflow_jni_gpu - 1.8.0 + 1.9.0-rc0 ``` @@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. 
Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip). 3. Extract this .zip file. - +__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. ### Validate the installation @@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.8.0.jar HelloTF.java
+
javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
### Running @@ -241,11 +241,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 0ed8160027..c8d706cf3c 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,9 +339,7 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -Prior to installing TensorFlow with GPU support, ensure that your system meets all -[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container -with NVidia GPU support, enter a command of the following format: +To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
@@ -438,7 +436,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
## Validate your installation @@ -517,7 +515,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -684,14 +682,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -703,14 +701,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -722,14 +720,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -741,14 +739,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
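The Linux guide above lists CPU and GPU wheels per Python version and, earlier, the NVIDIA driver, CUDA, and cuDNN requirements for GPU support. As a quick post-install sanity check, something like the following sketch (not part of this patch; it assumes one of the tensorflow_gpu 1.9.0rc0 wheels above is installed) confirms the build was compiled with CUDA and that a GPU device is visible:

```
# Sketch only: verify that a GPU wheel from the list above can see a device.
# Assumes a tensorflow_gpu-1.9.0rc0 package is installed.
import tensorflow as tf

print(tf.VERSION)                    # e.g. '1.9.0-rc0'
print(tf.test.is_built_with_cuda())  # True for the *_gpu wheels
print(tf.test.gpu_device_name())     # e.g. '/device:GPU:0'; empty string if no GPU is found
```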
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 29a867a9e3..9d01271c5a 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
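Both install guides end with a "Validate your installation" step; a minimal sketch of such a check (not part of this patch), usable with any of the CPU wheels listed above:

```
# Sketch only: minimal post-install check for the CPU packages above.
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))  # b'Hello, TensorFlow!'
```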
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index 5ba522b436..dc6c1e36fc 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.8.0 on Linux: +for TensorFlow 1.9.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
 
## Validate your installation @@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: - * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux} - * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS} - * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows} + * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux} + * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} + * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows} Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we @@ -433,6 +433,8 @@ Stack Overflow and specify the `tensorflow` tag. **Linux** + + @@ -456,6 +458,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | 7 | 9
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
+ @@ -472,6 +475,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.11.0 | N/A | N/A
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
+ + diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index cf0db59021..efef5dd0da 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md @@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program: 1. Include the jcenter AAR which contains it, as in this - [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65) + [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65) 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/). diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 8b22c04d87..2b84dbb973 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md @@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this: - bazel build tensorflow/tools:freeze_graph - bazel-bin/tensorflow/tools/freeze_graph \ + bazel build tensorflow/python/tools:freeze_graph + bazel-bin/tensorflow/python/tools/freeze_graph \ --input_graph=/tmp/model/my_graph.pb \ --input_checkpoint=/tmp/model/model.ckpt-1000 \ --output_graph=/tmp/frozen_graph.pb \ diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index 2fea02d861..c97f74139c 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md @@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.8.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.7.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
- +
Quantized | Float
0 | -10.0
255 | 30.0
128 | 10.0
255 | 30.0
Table 2: Example quantized value range diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index c4aae1d9d6..b13b47184d 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -21,18 +21,17 @@ Note: TensorFlow also includes a deprecated `Estimator` class at Estimators provide the following benefits: -* You can run Estimators-based models on a local host or on a +* You can run Estimator-based models on a local host or on a distributed multi-server environment without changing your model. - Furthermore, you can run Estimators-based models on CPUs, GPUs, + Furthermore, you can run Estimator-based models on CPUs, GPUs, or TPUs without recoding your model. * Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code, +* You can develop a state of the art model with high-level intuitive code. In short, it is generally much easier to create models with Estimators than with the low-level TensorFlow APIs. -* Estimators are themselves built on tf.layers, which +* Estimators are themselves built on @{tf.layers}, which simplifies customization. -* Estimators build the graph for you. In other words, you don't have to - build the graph. +* Estimators build the graph for you. * Estimators provide a safe distributed training loop that controls how and when to: * build the graph @@ -57,7 +56,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models -through dense, feed-forward neural networks. +based on dense, feed-forward neural networks. ### Structure of a pre-made Estimators program @@ -79,7 +78,7 @@ of the following four steps: an input function: def input_fn(dataset): - ... # manipulate dataset, extracting feature names and the label + ... # manipulate dataset, extracting the feature dict and the label return feature_dict, label (See @{$programmers_guide/datasets} for full details.) @@ -96,13 +95,13 @@ of the following four steps: population = tf.feature_column.numeric_column('population') crime_rate = tf.feature_column.numeric_column('crime_rate') median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn='lambda x: x - global_education_mean') + normalizer_fn=lambda x: x - global_education_mean) 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`: # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.Estimator.LinearClassifier( + estimator = tf.estimator.LinearClassifier( feature_columns=[population, crime_rate, median_education], ) diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 845194fe0e..90f5c53a17 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md @@ -528,10 +528,10 @@ suggested by the following snippet: categorical_column = ... # Create any categorical column # Represent the categorical column as an embedding column. -# This means creating a one-hot vector with one element for each category. 
+# This means creating an embedding vector lookup table with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=dimension_of_embedding_vector) + dimension=embedding_dimensions) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 03e60972aa..86f5204ec3 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,7 +21,8 @@ from __future__ import division from __future__ import print_function import os -import urllib + +from six.moves.urllib.request import urlretrieve import tensorflow as tf @@ -38,9 +39,7 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - raw = urllib.urlopen(download_url).read() - with open(file_name, 'w') as f: - f.write(raw) + urlretrieve(download_url, file_name) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 5602775b62..a5224fbda0 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -10955,7 +10955,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. // If not specified, defaults to func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { @@ -18098,9 +18098,10 @@ func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_val } // Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` -// // if < 0, `scale * features` otherwise. // +// Assumes weights to have zero mean and variance 1.0 / fan_in. +// // See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) func Selu(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { @@ -21625,7 +21626,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { // generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. // // The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. // Each element must be in the range `[0, 255]` (It represents the value of a // pixel in the output image). Non-finite values in the input tensor are // replaced by this tensor in the output image. The default value is the color @@ -24018,7 +24019,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. 
// If not specified, defaults to func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { @@ -24714,8 +24715,7 @@ type DecodeProtoV2Attr func(optionalAttr) // If not specified, defaults to "local://" func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr { return func(m optionalAttr) { - m["descriptor_source"] = value - } + m["descriptor_source"] = value } } // DecodeProtoV2MessageFormat sets the optional message_format attribute to value. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index debd95fc62..9b171f66ec 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,9 +376,6 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations - op_class.add_annotation( - Annotation::Create("Generated", "javax.annotation") - .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -415,8 +412,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, - &dependencies, &op_javadoc); + writer.Write(kLicense) + .EndLine() + .Write("// This class has been generated, DO NOT EDIT!") + .EndLine() + .EndLine() + .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 181fd4c5e3..941ab2699c 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,6 +96,7 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } + Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index b2e6c60021..bd97b181ff 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
- def grad_fn(*args): + def grad_fn(*args, **kwds): """Computes the gradient of the wrapped function.""" this_tape = tape.push_new_tape() try: - end_node = f(*args) + end_node = f(*args, **kwds) if end_node is None: raise ValueError("Cannot differentiate a function that returns None; " "did you forget to return a value from {}?".format( diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 9cd17e0407..20522098b0 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -978,7 +978,10 @@ py_test( size = "large", srcs = ["keras_test.py"], srcs_version = "PY2AND3", - tags = ["notsan"], + tags = [ + "no_windows", + "notsan", + ], deps = [ ":keras", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py index 7cdf840c97..b18212cfcd 100644 --- a/tensorflow/python/estimator/exporter.py +++ b/tensorflow/python/estimator/exporter.py @@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result): return best_eval_result[default_key] > current_eval_result[default_key] -def _verify_compre_fn_args(compare_fn): +def _verify_compare_fn_args(compare_fn): """Verifies compare_fn arguments.""" args = set(util.fn_args(compare_fn)) if 'best_eval_result' not in args: @@ -265,7 +265,7 @@ class BestExporter(Exporter): self._compare_fn = compare_fn if self._compare_fn is None: raise ValueError('`compare_fn` must not be None.') - _verify_compre_fn_args(self._compare_fn) + _verify_compare_fn_args(self._compare_fn) self._saved_model_exporter = _SavedModelExporter( name, serving_input_receiver_fn, assets_extra, as_text) diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index 035c7c148c..a6cefdece2 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -136,11 +136,13 @@ def numpy_input_fn(x, values in `x` have same shape). ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict. ValueError: if x or y is an empty dict. - TypeError: `x` is not a dict or array, or if `shuffle` is not bool. + TypeError: `x` is not a dict or array. + ValueError: if 'shuffle' is not provided or a bool. """ if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) def input_fn(): """Numpy input function.""" diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py index 92d057e25d..81b201cc5c 100644 --- a/tensorflow/python/estimator/inputs/numpy_io_test.py +++ b/tensorflow/python/estimator/inputs/numpy_io_test.py @@ -286,8 +286,9 @@ class NumpyIoTest(test.TestCase): x = np.arange(32, 36) y = np.arange(4) with self.test_session(): - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None. 
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 938e244fb3..57f8e5fd6a 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,15 +68,16 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - TypeError: `shuffle` is not bool. + ValueError: if 'shuffle' is not provided or a bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index e5912a3b28..dcecf6dd61 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,8 +70,9 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 8e2ec83020..51a61adb21 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns), len(placeholders))) + len(dataframe.columns) + 1, len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index c80af08fba..2f439f765e 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initalized(): +def _any_variable_initialized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initalized(): + if _any_variable_initialized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 6688a84130..5e094ae92b 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD +from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Apply a mask - s_2 = keras.layers.Lambda(lambda k: - K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) + # Read m + m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) + s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train > 0} + 'input_m': input_m_train.astype(np.str)} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test > 0} + 'input_m': input_m_test.astype(np.str)} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index 2d6925d1a8..af5d709f7e 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testLoopWithVecAnd4D(self): @@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 2 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testBinaryOpSecondPort(self): diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 
e487f583be..f608dea430 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,6 +93,8 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". + References: + - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 70b6a8431a..9f91368e5b 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,15 +724,6 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) - if self.write_grads: - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -759,6 +750,18 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) + if self.write_grads: + for weight in layer.trainable_weights: + mapped_weight_name = weight.name.replace(':', '_') + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) + if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index b355f4a269..5062a26580 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,6 +653,8 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) + # non_trainable_weights: moving_variance, moving_mean + model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index a4cd017d60..1c9135982e 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. 
self._layers = [] - # Used in symbolic mode only, only in conjonction with graph-networks + # Used in symbolic mode only, only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 6a94986b9c..7e82db028b 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 89c1f1a40f..fce6cbdb7a 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -409,11 +410,13 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - [[1.]], shape=[None, None], name=name + '_sample_weights')) + constant_op.constant([[1.]], dtype=K.floatx()), + shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - [1.], shape=[None], name=name + '_sample_weights')) + constant_op.constant([1.], dtype=K.floatx()), + shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2ecbff3a1c..e8838cd3bc 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they folow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost. 
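The `slice_arrays` docstring above notes that eager tensors follow symbolic-tensor slicing rules rather than NumPy fancy indexing, which is why batches are assembled via `concat`. A small illustration of the same idea (a sketch, not the TensorFlow implementation; assumes TF 1.9 with eager execution enabled):

```
# Sketch only: gather arbitrary rows from an eager tensor without NumPy-style
# fancy indexing, mirroring the concat-based workaround described above.
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

data = tf.constant(np.arange(20).reshape(10, 2), dtype=tf.float32)
indices = [0, 3, 7]

# Concatenate one-row slices (essentially what the workaround does) ...
batch = tf.concat([data[i:i + 1] for i in indices], axis=0)
# ... which produces the same result as a single gather.
batch_via_gather = tf.gather(data, indices)

print(batch.numpy())
print(batch_via_gather.numpy())
```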
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index a54d6da839..c519e194bd 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_std=None, target_max=2) + target_mean=0., target_max=2, target_min=-2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(3. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / fan_in) + std = np.sqrt(2. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(1. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / fan_in) + std = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 5061825d38..f60064ed63 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,7 +19,9 @@ from __future__ import division from __future__ import print_function import copy +import sys import types as python_types +import warnings import numpy as np @@ -714,6 +716,7 @@ class Lambda(Layer): return self.mask def get_config(self): + module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -721,21 +724,26 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' + output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' + output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' + output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, + 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, + 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -745,8 +753,16 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() + module = config.pop('module', None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. ' + 'It may cause errors.'.format(module) + , UserWarning) if custom_objects: - globs = dict(list(globs.items()) + list(custom_objects.items())) + globs.update(custom_objects) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -760,6 +776,14 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) + output_shape_module = config.pop('output_shape_module', None) + if output_shape_module in sys.modules: + globs.update(sys.modules[output_shape_module].__dict__) + elif output_shape_module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' + 'It may cause errors.'.format(output_shape_module) + , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index c616d8f24f..e6e45902a8 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,5 +144,19 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) +class TestModelBackend(test.TestCase): + + def test_model_backend_float64_use_cases(self): + # Test case for GitHub issue 19318 + floatx = keras.backend.floatx() + keras.backend.set_floatx('float64') + + x = keras.Input((5,)) + y = keras.layers.Dense(1)(x) + model = keras.models.Model(x, y) + model.compile('rmsprop', 'mse') + + keras.backend.set_floatx(floatx) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 9d54add264..94ed8ebd31 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,6 +130,16 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testHalfInt(self): + s = lambda strs: [x.decode("ascii") for x in strs] + + with self.test_session(): + input_ = array_ops.placeholder(dtypes.int16) + int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] + output = string_ops.as_string(input_) + result = output.eval(feed_dict={input_: int_inputs_}) + self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 08b03f8518..16fdedac41 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index e08123b041..fb52d10475 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,9 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -414,6 +417,16 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) + def testClipByValueEmptyTensor(self): + # Test case for GitHub issue 19337 + zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) + x = clip_ops.clip_by_value(zero, zero, zero) + y = clip_ops.clip_by_value(zero, 1.0, 1.0) + z = clip_ops.clip_by_value(zero, zero, 1.0) + w = clip_ops.clip_by_value(zero, 1.0, zero) + with self.test_session(use_gpu=True) as sess: + sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 8699fd5b25..80ba7dafc9 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - print("expected = ", e_value) - print("actual = ", c_value) + tf_logging.info("expected = ", e_value) + tf_logging.info("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
err = np.fabs(jacob_t - reference_jacob_t).max() - print("conv_2d gradient error = ", err) + tf_logging.info("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 91ebe8de99..58e2a8ac2a 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,7 +197,21 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [0, 1, 2] + indices = [[[0], [7]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[1, :\] = \[7\] does not index into param " + r"\(shape: \[3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -207,7 +221,21 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlices(self): + def testBadIndicesWithSlicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2]] + indices = [[[0], [0], [1]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[2, :\] = \[1\] does not index into param " + r"\(shape: \[1,3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesWithSlicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index a2fcd751df..033fa95935 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,7 +27,8 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.float32, dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.int64, dtypes.float32, + dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -122,6 +123,9 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -177,7 +181,19 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2], [3, 4, 5]] + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): + array_ops.gather(params, [[7]], axis=0).eval() + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): + array_ops.gather(params, [[7]], axis=1).eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index a9b55854f1..795aa67248 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,6 +362,33 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) +class VarianceScalingInitializationTest(test.TestCase): + + def testNormalDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='normal') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + def testUniformDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='uniform') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + # TODO(vrv): move to sequence_ops_test? 
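# (Editorial aside, not part of the patch: a minimal standalone sketch of the
# statistic checked by the VarianceScalingInitializationTest added above. With
# the default scale=1.0 and mode='fan_in', samples drawn for a [100, 100]
# weight matrix should have mean close to 0 and variance close to 1 / fan_in;
# TF 1.x session-style execution is assumed, and the tolerances mirror the
# test's err=1e-2.)
import numpy as np
import tensorflow as tf

shape = [100, 100]  # fan_in == shape[0] for a 2-D weight matrix
init = tf.variance_scaling_initializer(distribution='uniform')
with tf.Session() as sess:
  x = sess.run(init(shape))
assert abs(np.mean(x)) < 1e-2                   # mean ~ 0
assert abs(np.var(x) - 1.0 / shape[0]) < 1e-2   # var ~ 1 / fan_in == 0.01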
class RangeTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index a0c372db7d..e95c729715 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -947,7 +947,7 @@ class PoolingTest(test.TestCase): output_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s gradient error = " % func_name, err) + tf_logging.info("%s gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _ConstructAndTestSecondGradient(self, @@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase): input_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s second-order gradient error = " % func_name, err) + tf_logging.info("%s second-order gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 677253946e..253e43920b 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc import re import numpy as np @@ -434,13 +435,29 @@ class PyFuncTest(test.TestCase): # ----- Tests shared by py_func and eager_py_func ----- def testCleanup(self): - for _ in xrange(1000): - g = ops.Graph() - with g.as_default(): - c = constant_op.constant([1.], dtypes.float32) - _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) - self.assertLess(script_ops._py_funcs.size(), 100) + # Delete everything created by previous tests to avoid side effects. + ops.reset_default_graph() + gc.collect() + initial_size = script_ops._py_funcs.size() + # Encapsulate the graph generation, so locals can be deleted. + def make_graphs(): + for _ in xrange(1000): + g = ops.Graph() + with g.as_default(): + c = constant_op.constant([1.], dtypes.float32) + _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) + # These ops have a reference to 'c' which has a reference to the graph. + # Checks if the functions are being deleted though the graph is referenced from them. + # (see #18292) + _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + + # Call garbage collector to enforce deletion. 
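      # (Editorial note, not part of the original test: with this change the
      # FuncRegistry keeps only weak references, while the strong reference is
      # the one _internal_py_func now stores on the graph in
      # _py_funcs_used_in_graph. The gc.collect() below is what ensures the
      # entries are evicted once the graphs built in make_graphs() become
      # unreachable, letting the registry shrink back to initial_size.)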
+ make_graphs() + ops.reset_default_graph() + gc.collect() + self.assertEqual(initial_size, script_ops._py_funcs.size()) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 79fe927b8a..faa4b49a8d 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,7 +144,9 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64, np.complex64, np.complex128): + for vtype in (np.int32, + np.float32, np.float64, + np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -221,7 +223,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64): + for vtype in (np.int32, np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index c70a4ffce7..1a0fa744ae 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,7 +159,13 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. def clip_small_values(x): - return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x + threshold = 1e-4 + sign = np.sign(x) + + if isinstance(x, np.int32): + threshold = 1 + sign = np.random.choice([-1, 1]) + return threshold * sign if np.abs(x) < threshold else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -181,7 +187,11 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - for vtype in (np.float32, np.float64): + vtypes = [np.float32, np.float64] + if tf_scatter != state_ops.scatter_div: + vtypes.append(np.int32) + + for vtype in vtypes: for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 794be096b7..a82855dfeb 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,7 +264,9 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0)] + math_ops.unsorted_segment_sum, lambda t: 0), + (np.ndarray.__mul__, None, + math_ops.unsorted_segment_prod, lambda t: 1)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index a5bd1b6ee0..e20daccb28 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,5 +146,101 @@ class 
StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) +class StringSplitV2OpTest(test.TestCase): + + def testSplitV2(self): + strings = ["pigs on the wing", "animals"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) + self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) + self.assertAllEqual(shape, [2, 4]) + + def testSplitV2MultiCharSeparator(self): + # Match Python behavior: + # >>> '1<>2<>3'.split('<>') + # ['1', '2', '3'] + # >>> "<><>4<>5<><>6<>".split("<>") + # ['', '', '4', '5', '', '6', ''] + strings = ["1<>2<>3", "<><>4<>5<><>6<>"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep="<>") + indices, values, shape = sess.run(tokens) + self.assertAllEqual( + indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"", b"", b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 7]) + + def testSplitV2SimpleSeparator(self): + # Match Python behavior: + # >>> '1,2,3'.split(',') + # ['1', '2', '3'] + # >>> '1,2,,3,'.split(',') + # ['1', '2', '', '3', ''] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',') + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 5]) + + def testSplitV2EmptySeparator(self): + # Match Python behavior: + # >>> '1 2 3'.split() + # ['1', '2', '3'] + #>>> ' 1 2 3 '.split() + #['1', '2', '3'] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2]]) + self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) + self.assertAllEqual(shape, [2, 3]) + + def testSplitV2SimpleSeparatorMaxSplit(self): + # Match Python behavior: + # >>> '1,2,3'.split(',', maxsplit=1) + # ['1', '2,3'] + # >>> '4,5,,6,'.split(',', maxsplit=1) + # ['4', '5,,6,'] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) + self.assertAllEqual(shape, [2, 2]) + + def testSplitV2EmptySeparatorMaxSplit(self): + # Match Python behavior: + # '1 2 3'.split(maxsplit=1) + # ['1', '2 3'] + # >>> " 4 5 6 ".split(maxsplit=1) + # ['4', '5 6 '] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) + self.assertAllEqual(shape, [2, 2]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8129334703..fae63b1132 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,6 +2619,10 @@ reverse.__doc__ = 
gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") +@deprecation.deprecated_args( + None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") +@deprecation.deprecated_args( + None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 12afcd0b51..94c8d79335 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[:m, :n] = d(Re y)/d(Re x) - J[:m, n:] = d(Im y)/d(Re x) - J[m:, :n] = d(Re y)/d(Im x) - J[m:, n:] = d(Im y)/d(Im x) + J[::2, ::2] = d(Re y)/d(Re x) + J[::2, 1::2] = d(Im y)/d(Re x) + J[1::2, ::2] = d(Re y)/d(Im x) + J[1::2, 1::2] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bdcf420980..f27d9224c1 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -258,14 +259,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. - + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ @@ -280,13 +281,14 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. @@ -297,7 +299,8 @@ def random_flip_left_right(image, seed=None): def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -306,22 +309,37 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. 
Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _Assert3DImage(image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope) - return fix_image_flip_shape(image, result) + image = _AssertAtLeast3DImage(image) + shape = image.get_shape() + if shape.ndims == 3 or shape.ndims is None: + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope + ) + return fix_image_flip_shape(image, result) + elif shape.ndims == 4: + uniform_random = random_ops.random_uniform( + [array_ops.shape(image)[0]], 0, 1.0, seed=seed + ) + mirror_cond = math_ops.less(uniform_random, .5) + return array_ops.where( + mirror_cond, + image, + functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) + ) + else: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') @tf_export('image.flip_left_right') @@ -1634,13 +1652,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, name=None): +def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` of - type `uint8`. + appropriate operation to convert the input bytes `string` into a `Tensor` + of type `dtype`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1652,10 +1670,11 @@ def decode_image(contents, channels=None, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. + dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `uint8` with shape `[height, width, num_channels]` for + `Tensor` with type `dtype` and shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
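As a usage note for the new `dtype` argument documented above, the sketch below (hypothetical file path, TF 1.x graph mode assumed) shows the equivalence that the DecodeImageTest cases added elsewhere in this patch check: requesting `float32` from `decode_image` is intended to match decoding at the native precision and then converting with `convert_image_dtype`.

  import tensorflow as tf

  contents = tf.read_file('/tmp/example.png')  # hypothetical input file
  image = tf.image.decode_image(contents, dtype=tf.float32)
  # Intended to match decoding and converting explicitly; convert_image_dtype
  # also rescales integer data into [0, 1] for float outputs.
  reference = tf.image.convert_image_dtype(
      tf.image.decode_png(contents, dtype=tf.uint16), tf.float32)
  with tf.Session() as sess:
    img, ref = sess.run([image, reference])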
@@ -1679,7 +1698,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return gen_image_ops.decode_bmp(contents) + return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1692,7 +1711,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_gif(contents) + return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1701,7 +1720,11 @@ def decode_image(contents, channels=None, name=None): def _png(): """Decodes a PNG image.""" - return gen_image_ops.decode_png(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_png(contents, channels, + dtype=dtypes.uint8 + if dtype == dtypes.uint8 + else dtypes.uint16), dtype) def check_png(): """Checks if an image is PNG.""" @@ -1717,7 +1740,8 @@ def decode_image(contents, channels=None, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_jpeg(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_jpeg(contents, channels), dtype) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1878,7 +1902,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within in this range. + supplied image within this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints. 
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 45499dcce0..2a6ab26e96 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,6 +533,37 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) + def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): + image_shape = [16, 299, 299, 3] + warmup_rounds = 100 + benchmark_rounds = 1000 + config = config_pb2.ConfigProto() + if cpu_count is not None: + config.inter_op_parallelism_threads = 1 + config.intra_op_parallelism_threads = cpu_count + with session.Session("", graph=ops.Graph(), config=config) as sess: + with ops.device(device): + inputs = variables.Variable( + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, + trainable=False, + dtype=dtypes.float32) + run_op = image_ops.random_flip_left_right(inputs) + sess.run(variables.global_variables_initializer()) + for i in xrange(warmup_rounds + benchmark_rounds): + if i == warmup_rounds: + start = time.time() + sess.run(run_op) + end = time.time() + step_time = (end - start) / benchmark_rounds + tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") + print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " + "%.2f us" % + (tag, step_time * 1e6)) + self.report_benchmark( + name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), + iters=benchmark_rounds, + wall_time=step_time) + def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -551,6 +582,15 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) + def benchmarkBatchedRandomFlipLeftRightCpu1(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) + + def benchmarkBatchedRandomFlipLeftRightCpuAll(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) + + def benchmarkBatchedRandomFlipLeftRightGpu(self): + self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) + class AdjustHueBenchmark(test.Benchmark): @@ -987,7 +1027,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) + y = image_ops.random_flip_left_right(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1008,6 +1048,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipLeftRightWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] 
== 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1057,9 +1141,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) + seed = 42 + with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=42) + y = image_ops.random_flip_up_down(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1079,6 +1165,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipUpDownWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] == 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1156,6 +1286,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, + image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1166,14 +1297,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) - for op in [ - image_ops.random_flip_left_right, - image_ops.random_flip_up_down, - ]: - with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): - op(p_wrong_rank) - - def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1208,41 +1331,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) -class RandomFlipTest(test_util.TensorFlowTestCase): - - def testRandomLeftRight(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - def testRandomUpDown(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3880,5 +3968,88 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) +class DecodeImageTest(test_util.TensorFlowTestCase): + + def testJpegUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testJpegFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 2df230d470..724fcc39cd 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,7 +467,8 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - stddev = math.sqrt(scale) + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + stddev = math.sqrt(scale) / .87962566103423978 return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 222b8ebc9d..8276047cb6 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,8 +35,9 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# use an upper-case version of them. -@tf_export("Print") +# have an upper-case version of them. For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we also allow lowercase. +@tf_export("Print", "print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index e40481f3a7..466d0dadc8 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, - `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, + `int32`, `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. - y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. + y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32` or `float64`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 783d485892..f47f38e29e 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: - counts: A `Tensor` containing a the total count of the data (one value). + counts: A `Tensor` containing the total count of the data (one value). 
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,6 +689,9 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance + # Note: stop_gradient does not change the gradient that gets + # backpropagated to the mean from the variance calculation, + # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index a0b55eb077..0c2f5b06c4 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]): + with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features) + return math_ops.maximum(alpha * features, features, name=name) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 46a5f4fae6..035b4735af 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,6 +962,16 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) + def testName(self): + np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) + outputs_with_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values), + name='test_relu_op') + self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') + outputs_without_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values)) + self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') + class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index f8676ccb5f..219562de5d 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,6 +23,7 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import +import weakref import numpy as np import six @@ -129,11 +130,14 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - self._funcs = {} + # Only store weakrefs to the functions. The strong reference is stored in + # the graph. + self._funcs = weakref.WeakValueDictionary() def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() + # Store a weakref to the function self._funcs[token] = func return token @@ -186,7 +190,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`.
""" - func = self._funcs[token] + func = self._funcs.get(token, None) if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -228,19 +232,6 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) -class CleanupFunc(object): - """A helper class to remove a registered function from _py_funcs.""" - - def __init__(self, token): - self._token = token - - def __del__(self): - if _py_funcs is not None: - # If _py_funcs is None, the program is most likely in shutdown, and the - # _py_funcs object has been destroyed already. - _py_funcs.remove(self._token) - - def _internal_py_func(func, inp, Tout, @@ -270,17 +261,15 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph - cleanup = CleanupFunc(token) - # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): - graph._cleanup_py_funcs_used_in_graph = [] + if not hasattr(graph, "_py_funcs_used_in_graph"): + graph._py_funcs_used_in_graph = [] - # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph - # will be destroyed and their __del__ will remove the 'token' from - # the funcs registry. - graph._cleanup_py_funcs_used_in_graph.append(cleanup) + # Store a reference to the function in the graph to ensure it stays alive + # as long as the graph lives. When the graph is destroyed, the function + # is left to the garbage collector for destruction as well. + graph._py_funcs_used_in_graph.append(func) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 0130233746..c3b16a7bd5 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,6 +84,8 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") +@deprecation.deprecated_args( + None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -597,6 +599,8 @@ class KeywordRequired(object): @tf_export("sparse_split") +@deprecation.deprecated_args( + None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index ae79c01949..0280c89c10 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,6 +91,59 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) +@tf_export("strings.split") +def string_split_v2(source, sep=None, maxsplit=-1): + """Split elements of `source` based on `sep` into a `SparseTensor`. + + Let N be the size of source (typically N will be the batch size). Split each + element of `source` based on `sep` and return a `SparseTensor` + containing the split tokens. Empty tokens are ignored. + + For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', + then the output will be + + st.indices = [0, 0; + 0, 1; + 1, 0; + 1, 1; + 1, 2] + st.shape = [2, 3] + st.values = ['hello', 'world', 'a', 'b', 'c'] + + If `sep` is given, consecutive delimiters are not grouped together and are + deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and + sep of `"<>"` returns `["1", "2", "", "3"]`. 
If `sep` is None or an empty + string, consecutive whitespace is regarded as a single separator, and the + result will contain no empty strings at the start or end if the string has + leading or trailing whitespace. + + Note that the behavior described above matches Python's str.split. + + Args: + source: `1-D` string `Tensor`, the strings to split. + sep: `0-D` string `Tensor`, the delimiter string. + maxsplit: An `int`. If `maxsplit > 0`, limits the number of splits in the result. + + Raises: + ValueError: If sep is not a string. + + Returns: + A `SparseTensor` of rank `2`, the strings split according to the delimiter. + The first column of the indices corresponds to the row in `source` and the + second column corresponds to the index of the split component in this row. + """ + if sep is None: + sep = '' + sep = ops.convert_to_tensor(sep, dtype=dtypes.string) + source = ops.convert_to_tensor(source, dtype=dtypes.string) + + indices, values, shape = gen_string_ops.string_split_v2( + source, sep=sep, maxsplit=maxsplit) + indices.set_shape([None, 2]) + values.set_shape([None]) + shape.set_shape([2]) + return sparse_tensor.SparseTensor(indices, values, shape) + def _reduce_join_reduction_dims(x, axis, reduction_indices): """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index f49e2d314d..47414c28af 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,6 +1786,23 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` + Simple example of how to re-enter a premade variable scope safely: + + ```python + with tf.variable_scope("foo") as vs: + pass + + # Re-enter the variable scope. + with tf.variable_scope(vs, + auxiliary_name_scope=False) as vs1: + # Restore the original name_scope. + with tf.name_scope(vs1.original_name_scope): + v = tf.get_variable("v", [1]) + assert v.name == "foo/v:0" + c = tf.constant([1], name="c") + assert c.name == "foo/c:0" + ``` + Basic example of sharing a variable AUTO_REUSE: ```python @@ -1924,7 +1941,9 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't touch name scope. + the scope. If `False`, we don't create it. Note that the argument is + not inherited, and it only takes effect once, when the scope is created. You + should only use it for re-entering a premade variable scope. Returns: A scope that can be captured and reused.
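For reference, a brief usage sketch of the tf.strings.split wrapper added to string_ops.py above (TF 1.x session execution assumed; the expected values follow the Python str.split semantics its docstring describes):

  import tensorflow as tf

  st = tf.strings.split(["hello world", "a b c"])       # whitespace splitting
  st_sep = tf.strings.split(["1<>2<><>3"], sep="<>")    # empty tokens are kept
  with tf.Session() as sess:
    values, dense_shape = sess.run([st.values, st.dense_shape])
    sep_values = sess.run(st_sep.values)
  # values == [b'hello', b'world', b'a', b'b', b'c'], dense_shape == [2, 3]
  # sep_values == [b'1', b'2', b'', b'3']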
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100755 new mode 100644 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 522965990b..b59f8e1f98 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index bca9fa49eb..671b7e387e 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,7 +41,11 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" + +from __future__ import print_function + """ +_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -149,6 +153,7 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) +__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -333,7 +338,8 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + text) + get_module_docstring(module, package, api_name) + + text + _GENERATED_FILE_FOOTER) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 5bb3b3c444..10171b3d60 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index dc2bd40096..3051c4437e 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,6 +1532,10 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "print" + argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index a3fbe95bba..b641c39feb 100644 
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,4 +4,8 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "split" + argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " + } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 5fa75e1d61..883bb93647 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,6 +322,10 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" + + # Force downgrade setuptools. + pip install --upgrade setuptools==39.1.0 + } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index d4bf546d40..b216e3549f 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 072dd6ab99..1f0fd0387a 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,6 +134,12 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi +# If caller wants the with_the_same_user script to allow bad usernames, +# pass the var to the docker environment +if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then + CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" +fi + # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -148,6 +154,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ + ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 420d390d2b..148526492d 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,7 +32,8 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" +TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" + "\.\d.dev[\d]{0,8})-(.+)\.whl") BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 60290df833..88f1d04193 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,3 +115,7 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip2 install --upgrade setuptools==39.1.0 +pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index edb9d4b929..acd69ef346 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then fi set -e -pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -86,4 +85,7 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 +# Install last working version of setuptools. +pip3.5 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 5635977731..323b30f48e 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,7 +49,6 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 -pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -101,4 +100,8 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip3 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh new file mode 100755 index 0000000000..10a09a415a --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Usage: basic_mkl_test.sh + +# Helper function to traverse directories up until given file is found. +function upsearch () { + test / == "$PWD" && return || \ + test -e "$1" && echo "$PWD" && return || \ + cd .. && upsearch "$1" +} + +# Set up WORKSPACE. +WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" + +BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index 1bd1852ffc..b8bce57c87 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,6 +79,7 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" + WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -86,6 +87,7 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' + WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -100,6 +102,8 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ + //tensorflow:libtensorflow.so \ + //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -112,10 +116,12 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index 47539b2423..f8f63e276c 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,7 +31,11 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - undname_bin_path = find_msvc_tool(repository_ctx, vc_path, 
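The Raspberry Pi build above now tags the wheel with a real platform (`linux_armv6l` for Pi One/Zero, `linux_armv7l` for Pi Two/Three) instead of `any`, via the perl substitution on the bazel output name. An equivalent sketch of that rename in Python, using an illustrative input file name:

```python
import re

wheel_arch = "linux_armv7l"   # Pi Two/Three branch; the Pi One/Zero branch uses linux_armv6l
old_name = "tensorflow-1.9.0rc0-cp27-cp27mu-linux_x86_64.whl"  # hypothetical bazel output name
new_name = re.sub(r"tensorflow-([^-]+)-([^-]+)-.*",
                  r"tensorflow-\1-\2-none-" + wheel_arch + ".whl", old_name)
print(new_name)  # tensorflow-1.9.0rc0-cp27-none-linux_armv7l.whl
```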
"undname.exe").replace("\\", "\\\\") + + undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe") + if undname == None: + auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) + undname_bin_path = undname.replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index 06c2b997cb..b0114721bd 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,9 +64,6 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" -# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below -DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" - # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -77,8 +74,7 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} - echo "use default whl file location" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi while true; do @@ -131,7 +127,11 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +# Download whl file into the build context directory. +if [[ -z "${WHL_FILE_LOCATION}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index 935535312d..e188c88c8f 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - die "whl URL is not specified" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi # Create docker build context directory. @@ -121,8 +121,13 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +if [[ -z "${WHL_URL}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +else + wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +fi # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 406d134699..57a491255e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index a6cd44ced1..6796ad70e5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.8 +ARG TF_BRANCH=r1.9 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 2fe47f3356..204b5b4dba 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libcudnn7-dev=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7-dev=7.1.4.18-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index bff4a20392..9197651ff4 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 5910f0625e..620fef9363 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,6 +61,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:core", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", + "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index 0c4065bc77..f7e42ce536 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,51 +41,15 @@ function is_windows() { fi } -function main() { +function prepare_src() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - DEST=$(real_path $1) - TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) - - PKG_NAME_FLAG="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - PROJECT_NAME="" - while true; do - if [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - fi - shift 
- - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - echo $(date) : "=== Using tmpdir: ${TMPDIR}" + TMPDIR="$1" + mkdir -p "$TMPDIR" + echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -155,17 +119,28 @@ function main() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} + pushd ${RUNFILES%org_tensorflow} > /dev/null for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd + popd > /dev/null cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} +} + +function build_wheel() { + if [ $# -lt 2 ] ; then + echo "No src and dest dir provided" + exit 1 + fi + + TMPDIR="$1" + DEST="$2" + PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -173,15 +148,110 @@ function main() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} + pushd ${TMPDIR} > /dev/null rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd - rm -rf ${TMPDIR} + popd > /dev/null echo $(date) : "=== Output wheel file is in: ${DEST}" } +function usage() { + echo "Usage:" + echo "$0 [--src srcdir] [--dst dstdir] [options]" + echo "$0 dstdir [options]" + echo "" + echo " --src prepare sources in srcdir" + echo " will use temporary dir if not specified" + echo "" + echo " --dst build wheel in dstdir" + echo " if dstdir is not set do not build, only prepare sources" + echo "" + echo " Options:" + echo " --project_name set project name to name" + echo " --gpu build tensorflow_gpu" + echo " --gpudirect build tensorflow_gpudirect" + echo " --nightly_flag build tensorflow nightly" + echo "" + exit 1 +} + +function main() { + PKG_NAME_FLAG="" + PROJECT_NAME="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + SRCDIR="" + DSTDIR="" + CLEANSRC=1 + while true; do + if [[ "$1" == "--help" ]]; then + usage + exit 1 + elif [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == "--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + elif [[ "$1" == "--src" ]]; then + shift + SRCDIR="$(real_path $1)" + CLEANSRC=0 + elif [[ "$1" == "--dst" ]]; then + shift + DSTDIR="$(real_path $1)" + else + DSTDIR="$(real_path $1)" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then + echo "No destination dir provided" + usage + exit 1 + fi + + if [[ -z "$SRCDIR" ]]; then + # make temp srcdir if none set + SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" + fi + + prepare_src "$SRCDIR" + + if [[ -z "$DSTDIR" ]]; then + # 
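With the refactor above, `build_pip_package.sh` separates source staging (`prepare_src`) from wheel building (`build_wheel`), and `usage()` documents the new `--src`/`--dst` flags: `--src` alone only prepares sources (and keeps the tree afterwards, since `CLEANSRC` is cleared), while adding `--dst` also builds the wheel. An illustrative invocation from Python; the script path and directories are hypothetical:

```python
import subprocess

script = "bazel-bin/tensorflow/tools/pip_package/build_pip_package"

# Stage the pip package sources only; the tree in /tmp/tf_pkg_src is kept for reuse.
subprocess.check_call([script, "--src", "/tmp/tf_pkg_src"])

# Stage sources and build a nightly wheel into /tmp/tf_pkg in one invocation.
subprocess.check_call([script, "--src", "/tmp/tf_pkg_src",
                       "--dst", "/tmp/tf_pkg", "--nightly_flag"])
```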
only want to prepare sources + exit + fi + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" + + if [[ $CLEANSRC -ne 0 ]]; then + rm -rf "${TMPDIR}" + fi +} + main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d25a9e77b1..97f625e7e9 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.8.0' +_VERSION = '1.9.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,6 +54,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', + 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 29add6d5ea..15d7c70281 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,6 +814,9 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); + Print(); + Print("#include "); // for `std::stable_sort()` + Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index df71840b64..92bb5127da 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( - len(flat_b))) + tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " + + str(len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - print("Tensors have {0} different values ({1}%), with mean difference" - " {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, mean_difference, - mean_abs_difference)) + tf_logging.info("Tensors have {0} different values ({1}%), with mean" + " difference {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, + mean_difference, mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index 9c45359ee1..c030575109 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,7 +89,6 @@ import shutil from six import text_type from google.cloud import datastore -from six import text_type def is_real_file(dirpath, fname): diff --git a/tensorflow/workspace.bzl 
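The project-name selection at the end of the new `main()` above reduces to a small decision table; a sketch of the same logic (the `--gpudirect` case sets the flag directly during argument parsing and is omitted here):

```python
def pkg_name_flag(project_name="", nightly=False, gpu=False):
    """Mirror of the PKG_NAME_FLAG if/elif chain in the refactored main()."""
    if project_name:
        return "--project_name " + project_name
    if nightly and gpu:
        return "--project_name tf_nightly_gpu"
    if nightly:
        return "--project_name tf_nightly"
    if gpu:
        return "--project_name tensorflow_gpu"
    return ""  # plain "tensorflow" package

print(pkg_name_flag(nightly=True, gpu=True))  # --project_name tf_nightly_gpu
```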
b/tensorflow/workspace.bzl index dbec66216a..4f3df570a5 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" ], - sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", - strip_prefix = "mklml_lnx_2018.0.2.20180127", + sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", + strip_prefix = "mklml_lnx_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" ], - sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", - strip_prefix = "mklml_win_2018.0.2.20180127", + sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", + strip_prefix = "mklml_win_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" ], - sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", - strip_prefix = "mklml_mac_2018.0.2.20180127", + sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", + strip_prefix = "mklml_mac_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", ], - sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", - strip_prefix = "mkl-dnn-0.13", + sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", + strip_prefix = "mkl-dnn-0.14", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", - "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + 
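The `sha256` and `strip_prefix` fields updated above pin the exact bytes of each mirrored archive. An illustrative way to reproduce one of those digests (requires network access; the URL is one of the mirrors listed above):

```python
import hashlib
import urllib.request

url = ("https://github.com/intel/mkl-dnn/releases/download/"
       "v0.14/mklml_lnx_2018.0.3.20180406.tgz")
data = urllib.request.urlopen(url).read()
# Should match the mkl_linux sha256 recorded above if the release artifact is unchanged.
print(hashlib.sha256(data).hexdigest())
```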
"http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", + "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", ], - sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", - strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", + sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", + strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 07bb6645eb..e54c1a4501 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,6 +64,7 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", + "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 1b8e40765e..08cb84ea2c 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,6 +10,7 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", + "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 4418ac32fc..663a218733 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,8 +291,10 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", + "jerror.h", "jinclude.h", "jmorecfg.h", + "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 76ab32d69c..17c5449cc0 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,7 +28,14 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ], + ] + select({ + "@org_tensorflow//tensorflow:linux_ppc64le": [ + "powerpc/powerpc_init.c", + "powerpc/filter_vsx_intrinsics.c", + ], + "//conditions:default": [ + ], + }), hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 954f21f5f8..3c7e5c8469 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,6 +6,7 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" +_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -152,6 +153,22 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) +def _get_bash_bin(repository_ctx): + """Gets the bash bin path.""" + bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) + if bash_bin != None: + return bash_bin + else: + bash_bin_path = repository_ctx.which("bash") + if bash_bin_path != None: + return str(bash_bin_path) + else: + _fail("Cannot find bash in PATH, please make sure " + + "bash is installed and add its directory in PATH, or --define " + + "%s='/path/to/bash'.\nPATH=%s" % ( + _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) + + def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -184,14 +201,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -199,7 +216,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -294,6 +311,7 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ + _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 36f5aa5bde..cb67d3e961 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,7 +17,6 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", - "gemmlowp", ]) def _is_windows(ctx): @@ -88,7 +87,9 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - ctx.template("BUILD", ctx.attr.build_file, { + # Use BUILD.bazel to avoid conflict with third party projects with + # BUILD or build (directory) underneath. + ctx.template("BUILD.bazel", ctx.attr.build_file, { "%prefix%": ".." 
if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From 148b4381fd0259cae441e459ec8ebe2c5d557722 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 11:48:36 -0700 Subject: Automated g4 rollback of changelist 201011811 PiperOrigin-RevId: 201033171 --- CONTRIBUTING.md | 2 +- README.md | 1 - RELEASE.md | 67 +--- configure.py | 5 - tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 - tensorflow/cc/gradients/nn_grad.cc | 47 --- tensorflow/cc/gradients/nn_grad_test.cc | 84 +---- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 - tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 - tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 -- .../xla/service/cpu/runtime_single_threaded_fft.h | 31 -- .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 - tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 - tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ---- tensorflow/contrib/autograph/__init__.py | 3 - tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 -- tensorflow/contrib/ffmpeg/__init__.py | 1 + tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 + tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 - tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 - tensorflow/contrib/lite/toco/toco_port.h | 18 - tensorflow/contrib/makefile/compile_nsync.sh | 2 +- .../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 20 +- .../kernels/periodic_resample_op.cc | 5 - .../kernels/periodic_resample_op.h | 415 ++++++--------------- .../contrib/periodic_resample/ops/array_ops.cc | 53 +-- .../periodic_resample/ops/array_ops_test.cc | 41 -- .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- 
.../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 - .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md | 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 ++- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 --- .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 ++--- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 -- tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 - .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 --- .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 - tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 - .../common_runtime/mkl_threadpool_device_test.cc | 53 --- tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 + tensorflow/core/framework/op_gen_lib.cc | 1 - .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +------- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 -- tensorflow/core/grappler/costs/graph_properties.cc | 1 + tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 - tensorflow/core/kernels/cwise_op_clip.cc | 43 ++- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 - tensorflow/core/kernels/gather_functor.cc | 1 - tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 - tensorflow/core/kernels/gather_nd_op.cc | 4 - tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 - tensorflow/core/kernels/gather_op.cc | 1 - tensorflow/core/kernels/mkl_concat_op.cc | 213 +++-------- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 - tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 - tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 - .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 ------- tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 +- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 - tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 -- tensorflow/core/platform/cpu_info.h | 7 - tensorflow/core/platform/default/build_config.bzl | 2 - .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 - tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 +-- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- 
tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 - tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- .../python/grappler/layout_optimizer_test.py | 4 +- tensorflow/python/keras/activations.py | 2 - tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 - tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 - .../python/kernel_tests/as_string_op_test.py | 10 - tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 - tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 -- tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 ----- tensorflow/python/ops/array_ops.py | 4 - tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 +++---------- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 - tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 - tensorflow/python/ops/string_ops.py | 53 --- tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 - .../tools/api/golden/tensorflow.strings.pbtxt | 4 - tensorflow/tools/ci_build/builds/pip.sh | 4 - .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 - tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 - 
.../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 -- tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- .../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 - tensorflow/tools/pip_package/build_pip_package.sh | 160 +++----- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 - .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 + tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 - third_party/highwayhash.BUILD | 1 - third_party/jpeg/jpeg.BUILD | 2 - third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 231 files changed, 903 insertions(+), 3337 deletions(-) delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h delete mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc delete mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt delete mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt delete mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100644 => 100755 tensorflow/python/tools/import_pb_to_tensorboard.py delete mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'configure.py') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index db4b1581ae..8669c25c45 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 63853137cf..6fb4486d0d 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,6 @@ $ python 42 >>> sess.close() ``` -Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index e09e9c6190..84d9d52868 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,62 +1,3 @@ -# Release 1.9.0 - -## Major Features And Improvements -* Update tf.keras to the Keras 2.1.6 API. -* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. -* Adding support of core feature columns and losses to gradient boosted trees estimators. -* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. -* Layered variable names have changed in the following conditions: - * Using `tf.keras.layers` with custom variable scopes. - * Using `tf.layers` in a subclassed `tf.keras.Model` class. 
See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details - -## Breaking Chances - * If you're opening empty variable scopes; replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...). - -## Bug Fixes and Other Changes -* `tf.data`: - * The `DatasetBase::DebugString()` method is now `const`. - * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. -* Eager Execution: -* `tf.keras`: - * Move Keras code out of _impl folder and remove API files. - * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. - * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. -* Accelerated Linear Algebra (XLA): -* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). -* `tf.contrib`: - * Add `tf.contrib.data.choose_from_datasets()`. - * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. - * `tf.contrib.framework.zero_initializer` supports ResourceVariable. - * Adding "constrained_optimization" to tensorflow/contrib. -* Other: - * Add GCS Configuration Ops. - * Changing signature of `MakeIterator` to enable propagating error status. - * KL divergence for two Dirichlet distributions. - * More consistent GcsFileSystem behavior for certain reads past EOF. - * Update benchmark for tf.scan to match ranges across eager and graph modes. - * Fixed bug in `tf.reduce_prod gradient` for complex dtypes. - * Add optional `args` argument to `Dataset.from_generator()`. - * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). - * Benchmark for tf.scan in graph and eager modes. - * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. - * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. - * Support indicator column in boosted trees. - * Prevent `tf.gradients()` from backpropagating through integer tensors. - * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. - * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary. - * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. - * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed. - * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. - * Allow LinearOperator to broadcast. - * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. 
- - -## Thanks to our Contributors - -This release contains contributions from many people at Google, as well as: - -Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang - # Release 1.8.0 ## Major Features And Improvements @@ -463,6 +404,14 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 +## Major Features And Improvements +* `tf.keras` is now part of the core TensorFlow API. +* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of + the core TensorFlow API. + * The API is now subject to backwards compatibility guarantees. + +# Release 1.4.0 + ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index ada342a50a..bde7af8c0e 100644 --- a/configure.py +++ b/configure.py @@ -1397,10 +1397,6 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') -def set_build_strip_flag(): - write_to_bazelrc('build --strip=always') - - def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1523,7 +1519,6 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) - set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 6d134dbb80..a73c4ca3aa 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files. +# symbols in object files and -s strips the output. 
tf_cc_shared_object( name = "libtensorflow.so", @@ -489,6 +489,7 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", + "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -514,6 +515,7 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", + "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 7184ad68fb..02a6a58b61 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,12 +15,10 @@ # ============================================================================== TF_PREFIX='/usr/local' -LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" - echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -28,7 +26,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -40,11 +38,6 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; - -l|--libdir) - case "$2" in - "") shift 2 ;; - *) LIBDIR=$2 ; shift 2 ;; - esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -62,7 +55,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/${LIBDIR} +libdir=\${exec_prefix}/lib includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 35a01e0341..52c177212a 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,7 +38,6 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); -REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index c73482d5f4..0cb3132e94 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,53 +255,6 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); -Status SoftplusGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); - -Status SoftsignGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); - -Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - bool overlapping; - TF_RETURN_IF_ERROR( - GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); - auto dx = internal::FractionalAvgPoolGrad( - scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), - grad_inputs[0], op.output(1), op.output(2), - internal::FractionalAvgPoolGrad::Overlapping(overlapping)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); - -Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - bool overlapping; - TF_RETURN_IF_ERROR( - GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); - auto dx = internal::FractionalMaxPoolGrad( - scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), - op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); - } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index b4d457a9d1..c4eba7ecb0 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,8 +28,6 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; -using ops::FractionalAvgPool; -using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -43,8 +41,6 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; -using ops::Softplus; -using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -75,30 +71,22 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that every pair of elements are at - // least a reasonable amount apart. - // This is an issue for max pooling operations, in which perturbations by the - // numeric gradient computation in the gradient checker can change the max - // value if a pool has values that are too close together. + // Sets tensor with random values, ensuring that the max value is largest by + // a reasonable amount. 
+ // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which + // perturbations by the numeric gradient computation in the gradient checker + // can change the max value if values are too close together. template - void SetRandomValuesForMaxPooling(Tensor* tensor) { + void SetRandomValuesWithBumpedMax(Tensor* tensor) { auto tensor_flat = tensor->flat(); - // First set the array to an increasing sequence of values spaced - // a reasonable amount apart - T cur = 0; - for (size_t i = 0; i < tensor->NumElements(); i++) { - tensor_flat(i) = cur; - cur += 5e-2; - } - // Fischer-Yates shuffle the array - for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { - // j <- random integer 0 <= j <= i - size_t j = random::New64() % (i + 1); - // swap values at i, j - T tmp = tensor_flat(i); - tensor_flat(i) = tensor_flat(j); - tensor_flat(j) = tmp; + tensor_flat.setRandom(); + int32 max_index = 0; + for (size_t i = 1; i < tensor->NumElements(); i++) { + if (tensor_flat(i) > tensor_flat(max_index)) { + max_index = i; + } } + tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -201,7 +189,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -214,7 +202,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -227,7 +215,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -260,45 +248,5 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } -TEST_F(NNGradTest, SoftplusGrad) { - TensorShape shape({3, 7}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto y = Softplus(scope_, x); - RunTest(x, shape, y, shape); -} - -TEST_F(NNGradTest, SoftsignGrad) { - TensorShape shape({3, 7}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto y = Softsign(scope_, x); - RunTest(x, shape, y, shape); -} - -TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { - TensorShape x_shape({1, 3, 7, 1}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); - // Force consistent pooling regions for unit testing. - auto y = FractionalAvgPool( - scope_, x, {1, 1.2, 1.9, 1}, - FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( - 2)); - TensorShape y_shape({1, 2, 3, 1}); - RunTest(x, x_shape, y.output, y_shape); -} - -TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { - TensorShape x_shape({1, 3, 7, 1}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); - // Force consistent pooling regions for unit testing. 
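The helper swap above (back to `SetRandomValuesWithBumpedMax`) is about keeping the pooled maximum unambiguous while the numeric gradient checker perturbs inputs. A small NumPy illustration of the failure mode the comments describe, using a nearly tied pair of values:

```python
import numpy as np

# If two candidates for the max are closer together than the finite-difference
# step, perturbing one of them can change which element wins, and the numeric
# "gradient" lands between the two valid subgradients (0 and 1).
x = np.array([0.50000, 0.50001])
eps = 1e-3

def f(v):
    return v.max()

numeric_grad_x0 = (f(x + np.array([eps, 0.0])) - f(x - np.array([eps, 0.0]))) / (2 * eps)
print(numeric_grad_x0)  # ~0.5, neither 0 nor 1, so a check against the analytic gradient fails
```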
- auto y = FractionalMaxPool( - scope_, x, {1, 1.2, 1.9, 1}, - FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( - 2)); - Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); - TensorShape y_shape({1, 2, 3, 1}); - RunTest(x, x_init_value, y.output, y_shape); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6641d45e83..6e050cf564 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 192 +// arg bytes aligned: 128 // temp bytes total: 126 -// temp bytes aligned: 320 +// temp bytes aligned: 224 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index 4e194a6aba..ebfe4806c2 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a sequence of protocol buffers into an object file. +// Embeds a a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d1a669ceb1..d085864f00 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 64; +// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 32; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 06ec623eb2..6d603a02eb 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 64 byte alignment for the tfcompile runtime to mimic the + // We've chosen 32 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 64)); + EXPECT_EQ(bufD[2], add_ptr(base, 32)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 128)); - EXPECT_EQ(bufD[5], add_ptr(base, 192)); - EXPECT_EQ(bufD[6], add_ptr(base, 256)); + EXPECT_EQ(bufD[4], add_ptr(base, 64)); + EXPECT_EQ(bufD[5], add_ptr(base, 128)); + EXPECT_EQ(bufD[6], add_ptr(base, 160)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 1067b38f93..d82922a359 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,7 +178,6 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", - ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -517,6 +516,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,22 +578,6 @@ cc_library( ], ) -cc_library( - name = "runtime_single_threaded_fft", - srcs = [ - "runtime_fft_impl.h", - "runtime_single_threaded_fft.cc", - ], - hdrs = ["runtime_single_threaded_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework_lite", - "//third_party/eigen3", - ], -) - cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 54c52bc08f..215405f680 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,8 +51,6 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; -extern const char* const kEigenSingleThreadedFftSymbolName = - "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index aa0e967123..1dce6efa5c 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,7 +52,6 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; -extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 758b8c62b4..2c20be155f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,13 +1172,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - - bool multi_threaded_eigen = - hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); - const char* fn_name = multi_threaded_eigen - ? runtime::kEigenFftSymbolName - : runtime::kEigenSingleThreadedFftSymbolName; - + const char* fn_name = runtime::kEigenFftSymbolName; llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 0bf693edd0..984cb0616e 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,6 +21,8 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -69,9 +71,11 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; + TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; + temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -84,8 +88,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Eigen::Tensor full_fft(in_dims); - + Tensor temp(DataTypeToEnum::v(), temp_shape); + auto full_fft = temp.flat_inner_dims(); const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -108,9 +112,11 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; + TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; + temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -123,7 +129,8 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Eigen::Tensor full_fft(out_dims); + Tensor temp(DataTypeToEnum::v(), temp_shape); + auto full_fft = temp.flat_inner_dims(); // Calculate the starting point and range of the source of // negative frequency part. @@ -172,6 +179,7 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { + CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -196,8 +204,7 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - // Unsupported FFT type - abort(); + LOG(FATAL) << "Unsupported FFT type: " << fft_type; } } @@ -223,8 +230,7 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - // Unsupported FFT rank - abort(); + LOG(FATAL) << "Unsupported FFT rank " << fft_rank; } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc deleted file mode 100644 index 2613ddb127..0000000000 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" - -#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" -#include "tensorflow/core/platform/dynamic_annotations.h" -#include "tensorflow/core/platform/types.h" - -using tensorflow::int32; -using tensorflow::int64; - -TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( - const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { - tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, - fft_rank, input_batch, fft_length0, fft_length1, - fft_length2); -} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h deleted file mode 100644 index dcd133d012..0000000000 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ - -#include "tensorflow/core/platform/types.h" - -extern "C" { - -extern void __xla_cpu_runtime_EigenSingleThreadedFft( - const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, - tensorflow::int64 input_batch, tensorflow::int64 fft_length0, - tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); - -} // extern "C" - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index c4c90515ac..8d8c5e4c44 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,7 +38,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" -#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -203,7 +202,6 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); - REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 2515222cf2..d3bc47e61e 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const ::xla::Layout* layout) const { + const Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index 77bdcc9de0..e536c8afbf 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,17 +30,10 @@ limitations under the License. 
namespace xla { -TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : - exclude_entry_computation_(exclude_entry_computation) {} - StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue worklist; for (auto* computation : module->computations()) { - if (exclude_entry_computation_ && - computation == module->entry_computation()) { - continue; - } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index 7509501883..e5e9b10b5b 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,20 +27,13 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} - explicit TupleSimplifier(bool exclude_entry_computation); + TupleSimplifier() {} ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; - - private: - // When set, this pipeline stage will perform optimization of all computations - // apart from the module's entry computation. This is used by Graphcore's - // backend. - bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index d3635eae81..ca9ae91281 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,12 +42,6 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } - void Run(HloModule* module, bool change_expected, bool exclude_entry) { - TupleSimplifier simplifier(exclude_entry); - auto changed_status = simplifier.Run(module); - TF_ASSERT_OK(changed_status.status()); - EXPECT_EQ(change_expected, changed_status.ValueOrDie()); - } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -217,76 +211,5 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } -TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { - // Verify that the root computation can be excluded - auto module = CreateNewModule(); - - HloInstruction* p0; - HloInstruction* p1; - HloComputation* c0; - HloComputation* c1; - HloComputation* entry; - - { - HloComputation::Builder builder(TestName() + "_1"); - p0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); - - c0 = module->AddEmbeddedComputation(builder.Build()); - } - { - HloComputation::Builder builder(TestName() + "_2"); - p1 = 
builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); - - c1 = module->AddEmbeddedComputation(builder.Build()); - } - { - HloComputation::Builder builder(TestName() + "_Entry"); - HloInstruction* tuple_param = builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* call0 = builder.AddInstruction( - HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); - HloInstruction* call1 = builder.AddInstruction( - HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); - HloInstruction* tuple0 = - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); - HloInstruction* gte3 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); - - entry = module->AddEntryComputation(builder.Build()); - } - - Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); - - EXPECT_THAT(c0->root_instruction(), p0); - EXPECT_THAT(c1->root_instruction(), p1); - EXPECT_THAT(entry->instruction_count(), 9); -} - } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index dbdbad8f4c..637e49c082 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,7 +23,6 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
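The `CanExcludeEntryComputation` test removed above exercises the core TupleSimplifier rewrite: a tuple whose operands are `GetTupleElement(t, 0), ..., GetTupleElement(t, n-1)` over the same tuple `t` collapses back to `t` (hence `EXPECT_THAT(c0->root_instruction(), p0)`). A toy sketch of that rule on a hand-rolled expression type, not the XLA API:

```python
from collections import namedtuple

# Minimal stand-ins for HLO-style instructions.
Param = namedtuple("Param", ["name", "arity"])
GTE = namedtuple("GTE", ["operand", "index"])
MakeTuple = namedtuple("MakeTuple", ["elements"])

def simplify_tuple(instr):
    """Collapse MakeTuple(GTE(t, 0), ..., GTE(t, n-1)) back to t."""
    if isinstance(instr, MakeTuple):
        elems = instr.elements
        if (elems
            and all(isinstance(e, GTE) for e in elems)
            and all(e.operand is elems[0].operand for e in elems)
            and [e.index for e in elems] == list(range(len(elems)))
            and getattr(elems[0].operand, "arity", len(elems)) == len(elems)):
            return elems[0].operand
    return instr

p = Param("param", arity=3)
root = MakeTuple([GTE(p, 0), GTE(p, 1), GTE(p, 2)])
assert simplify_tuple(root) is p   # mirrors c0->root_instruction() == p0
```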
from tensorflow.contrib.autograph import utils -from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -44,8 +43,6 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', - # Overloaded operators - 'operators', # Special functions and directives 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index 2e0a2fcef4..bda5e26f43 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,15 +37,13 @@ add_dependencies( tf_core_lib tf_protos_cc) -if(tensorflow_BUILD_PYTHON_BINDINGS) - add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" - ) - add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) -endif() +add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" +) +add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index 6c90cf398c..f73da0b8ab 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 9244604489..a0c3ddd28b 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,6 +832,7 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) + add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index 4f957f1e0b..cffe069aa3 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,8 +44,7 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting 
destructor|::internal::|Internal|" - r"python_op_gen_internal|grappler") +EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -57,10 +56,6 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" - r"tensorflow::errors::Internal|" - r"tensorflow::Tensor::CopyFromInternal|" - r"tensorflow::kernel_factory::" - r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -69,7 +64,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"stream_executor::") + r"perftools::gputools") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 795f1993ba..45760a29ee 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,24 +151,16 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision. - # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and - # below test fails due to overflow error giving inf. So this check avoids that error by skipping square - # calculation and corresponding assert. - - if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ - np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): - - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.) 
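The reference computation above is done in `np.float128` because squaring the bijector's large outputs overflows `float64`; the guard removed by this hunk skipped that reference on platforms (e.g. IBM PPC) where `np.float128` has no extra range. A small sketch of the same inverse-log-det-Jacobian formula, assuming a genuine extended-precision `np.float128` as on x86 Linux (the default `skewness`/`tailweight` values are illustrative):

```python
import numpy as np

def sinh_arcsinh_ildj(y, skewness=0.0, tailweight=1.0):
    """Reference inverse_log_det_jacobian of the SinhArcsinh bijector."""
    y = np.float128(y)   # avoid inf when squaring large float64 values
    return (np.log(np.cosh(np.arcsinh(y) / tailweight - skewness)
                   / np.sqrt(y ** 2 + 1.0))
            - np.log(tailweight))

with np.errstate(over="ignore"):
    print(np.isinf(np.float64(1e200) ** 2))    # True: float64 overflows on y**2
print(np.isfinite(sinh_arcsinh_ildj(1e200)))   # True: float128 keeps it finite
```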
self.assertAllClose( -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), bijector.forward_log_det_jacobian(x, event_ndims=0).eval(), diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index adf92c27ea..d7909dd5a2 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -106,8 +106,7 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name( - "contrib_eager_iterator_function_buffer_resource")) + shared_name=_generate_shared_name("function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb index 5749f22ac5..4fe3a0e3f3 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -68,7 +68,7 @@ "# simply construct the object. Most layers take as a first argument the number\n", "# of output dimensions / channels.\n", "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensions is often unnecessary, as it can be inferred\n", + "# The number of input dimensionss is often unnecessary, as it can be inferred\n", "# the first time the layer is used, but it can be provided if you want to \n", "# specify it manually, which is useful in some complex models.\n", "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" @@ -267,7 +267,7 @@ " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", " * `call`, where you do the forward computation\n", "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." ] }, { diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 05bcdac2ca..84a413c791 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,8 +346,7 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32, - normalizer_fn=None): + dtype=dtypes.float32): """Returns a feature column that represents sequences of numeric data. 
Example: @@ -371,12 +370,6 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. - normalizer_fn: If not `None`, a function that can be used to normalize the - value of the tensor after `default_value` is applied for parsing. - Normalizer function takes the input `Tensor` as its argument, and returns - the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that - even though the most common use case of this function is normalization, it - can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -390,16 +383,12 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) - if normalizer_fn is not None and not callable(normalizer_fn): - raise TypeError( - 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype, - normalizer_fn=normalizer_fn) + dtype=dtype) def _assert_all_equal_and_return(tensors, name=None): @@ -418,7 +407,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): + ['key', 'shape', 'default_value', 'dtype'])): """Represents sequences of numeric data.""" @property @@ -430,10 +419,7 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - input_tensor = inputs.get(self.key) - if self.normalizer_fn is not None: - input_tensor = self.normalizer_fn(input_tensor) - return input_tensor + return inputs.get(self.key) @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index 45d7b74046..ee74cf56dc 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -948,7 +947,6 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) - self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -967,10 +965,6 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) - def test_normalizer_fn_must_be_callable(self): - with self.assertRaisesRegexp(TypeError, 'must be a callable'): - sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') - def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -991,41 +985,6 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor, 
dense_tensor.eval(session=sess)) - def test_get_sequence_dense_tensor_with_normalizer_fn(self): - - def _increment_two(input_sparse_tensor): - return sparse_ops.sparse_add( - input_sparse_tensor, - sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) - ) - - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, values [[0.], [1]] - # example 1, [[10.]] - indices=((0, 0), (0, 1), (1, 0)), - values=(0., 1., 10.), - dense_shape=(2, 2)) - - # Before _increment_two: - # [[0.], [1.]], - # [[10.], [0.]], - # After _increment_two: - # [[2.], [1.]], - # [[10.], [2.]], - expected_dense_tensor = [ - [[2.], [1.]], - [[10.], [2.]], - ] - numeric_column = sfc.sequence_numeric_column( - 'aaa', normalizer_fn=_increment_two) - - dense_tensor, _ = numeric_column._get_sequence_dense_tensor( - _LazyBuilder({'aaa': sparse_input})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_dense_tensor, dense_tensor.eval(session=sess)) - def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index 484ffee3e7..daba965a98 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,6 +28,7 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio +from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index b1b5126d9e..020b5c99c6 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py +from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index dc49383c5c..10d1ecc738 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,13 +119,14 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec +from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest'] +_allowed_symbols = ['nest', 'broadcast_to'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index a955e21b72..65cb94b5a4 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - tf_logging.info("expected = ", ref_value) - tf_logging.info("actual = ", value) + print("expected = ", ref_value) + print("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,8 +843,7 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - tf_logging.info("output_height=", output_height, ", output_width=", - output_width) + print("output_height=", output_height, ", output_width=", output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -881,8 +880,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - tf_logging.info("actual_y = ", actual_y) - tf_logging.info("expected_y = ", expected_y) + print("actual_y = ", actual_y) + print("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 2e5c84704f..6a5d982dc8 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 840015a7fa..436c3e1d4c 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,7 +30,9 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once +# the archive has been propagated in mirror.bazel.build. 
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc index 8b0ace96cc..106e3b0270 100644 --- a/tensorflow/contrib/lite/examples/minimal/minimal.cc +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -38,7 +38,7 @@ using namespace tflite; int main(int argc, char *argv[]) { if(argc != 2) { - fprintf(stderr, "minimal \n"); + fprintf(stderr, "Usage: %s \n"); return 1; } const char* filename = argv[1]; diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index 965273f0f0..bb2e615eac 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -128,6 +128,7 @@ TensorFlow operation not listed above are likely unsupported. Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) +* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) @@ -305,19 +306,6 @@ Options { } ``` -**GATHER** - -``` -Inputs { - 0: params tensor - 1: indices tensor - 2: axis tensor (optional) -} -Outputs { - 0: a tensor with same type as the params tensor. -} -``` - **GREATER** ``` diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 26349347fa..5efa70987e 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,7 +2,7 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requisite +## Pre-requesits Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. @@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). 
Let say the submission file is located at `/path/to/my_model.lite`, then call: ```sh -bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel build --cxxopt--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite ``` diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 1908f7fa6c..a2f192bbc2 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // The quantization of the input, output arrays is as follows: // - The input activations are quantized as uint8 on the interval // [-1, 127/128]. -// The rationale for that is that is the natural interval for output +// The rationale for that is that that is the natural interval for output // activations (see next point) and these need to be concatenated together. // We could accommodate different ranges by re-scaling, but we empirically // found that setting the input activations range to be [-1, 127/128] in the @@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // However, for a fixed-point implementation in 16-bit integers, using 5 // integer bits to represent the [-16, 16] range would leave only 11 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive -// representable values. Notice that is higher than the +// representable values. Notice that that is higher than the // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. // Using [-8, 8] thus seems like the better compromise overall, enjoying // an increment of 2.4e-4 between representable values and a worst-case diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index fd90823425..9400e757b9 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -55,7 +55,7 @@ class Interpreter(object): elif model_content and not model_path: self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( - model_content)) + model_content, len(model_content))) if not self._interpreter: raise ValueError( 'Failed to create model from {} bytes'.format(len(model_content))) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc index b283551c45..f705551fcb 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -397,14 +397,9 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile( } InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer( - PyObject* data) { - char * buf = nullptr; - Py_ssize_t length; - if (PY_TO_CPPSTRING(data, &buf, &length) == -1) { - return nullptr; - } + const char* data, size_t len) { std::unique_ptr model = - tflite::FlatBufferModel::BuildFromBuffer(buf, length); + tflite::FlatBufferModel::BuildFromBuffer(data, len); return model ? 
new InterpreterWrapper(std::move(model)) : nullptr; } diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index cbeb53bee7..b0ed7c4559 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -40,7 +40,8 @@ class InterpreterWrapper { static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path); // SWIG caller takes ownership of pointer. - static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data); + static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data, + size_t len); ~InterpreterWrapper(); bool AllocateTensors(); diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 88dda7290b..0913cd2c5c 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -34,8 +34,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from six import PY3 - from google.protobuf import text_format as _text_format from google.protobuf.message import DecodeError from tensorflow.contrib.lite.python import lite_constants as constants @@ -56,7 +54,6 @@ from tensorflow.python.framework.importer import import_graph_def from tensorflow.python.ops.variables import global_variables_initializer from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants -# from tensorflow.python.util.all_util import remove_undocumented class TocoConverter(object): @@ -206,12 +203,6 @@ class TocoConverter(object): except (_text_format.ParseError, DecodeError): try: print("Ignore 'tcmalloc: large alloc' warnings.") - - if not isinstance(file_content, str): - if PY3: - file_content = file_content.decode('utf-8') - else: - file_content = file_content.encode('utf-8') _text_format.Merge(file_content, graph_def) except (_text_format.ParseError, DecodeError): raise ValueError( @@ -391,5 +382,3 @@ def _freeze_graph(sess, output_tensors): output_arrays) else: return sess.graph_def - -# remove_undocumented(__name__) diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 5c7fa09891..e33b430937 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) { else if (dtype == DT_STRING) return ArrayDataType::kString; else - LOG(INFO) << "Unsupported data type in placeholder op: " << dtype; + LOG(INFO) << "Unsupported data type in placehoder op: " << dtype; return ArrayDataType::kNone; } diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index de76fd4032..1b21c8bc60 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -20,12 +20,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" -#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) -namespace std { -double round(double x) { return ::round(x); } -} // namespace std -#endif - namespace toco { namespace port { void CopyToBuffer(const string& src, char* dest) { diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 17f82b9dd7..5c019cb2bf 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -34,24 +34,6 @@ limitations under the License. #define TFLITE_PROTO_NS google::protobuf #endif -#ifdef __ANDROID__ -#include -namespace std { - -template -std::string to_string(T value) -{ - std::ostringstream os ; - os << value ; - return os.str() ; -} - -#ifdef __ARM_ARCH_7A__ -double round(double x); -#endif -} -#endif - namespace toco { namespace port { diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index a28fc3a87f..e8c6edd7ba 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -270,7 +270,7 @@ for arch in $archs; do PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ - ../../platform/posix/src/per_thread_waiter.c \ + ../../platform/c++11/src/per_thread_waiter.cc \ ../../platform/c++11/src/yield.cc \ ../../platform/c++11/src/time_rep_timespec.cc \ ../../platform/c++11/src/nsync_panic.cc diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 48953e2e38..eff9081e35 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once +# the archive has been propagated in mirror.bazel.build. +GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index a6be2084aa..2ed99d50a4 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name): name: An optional variable_scope name. Returns: - The recall at a given `precision`. + The recall at a the given `precision`. 
""" precisions = math_ops.div(tp, tp + fp + _EPSILON) tf_index = math_ops.argmin( diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h index c001615d3f..1d56d588bc 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h @@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI(); * has the fully accumulated Segment 1; and so on. The scatter-reduce is * complete. * - * Next, the allgather distributes these fully accumulated chunks across all + * Next, the allgather distributes these fully accumululated chunks across all * nodes. Communication proceeds in the same ring, once again in N-1 steps. At * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). * For example, at the first iteration, the following transfers will occur: diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 915e6504e1..21bf3f5313 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -224,10 +224,8 @@ class AdaMaxOptimizerTest(test.TestCase): var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), - rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), - rtol=1e-2) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if use_resource: self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index b6b10e500b..a7c97a1da2 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object): """ def __init__(self, worker_device): - """Create a new `ModelAverageCustomGetter`. + """Create a new `ElasticAverageCustomGetter`. Args: worker_device: String. Name of the `worker` job. 
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index aad1ca04c5..6ca7fe8b6e 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,13 +6,12 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "tf_cc_test", + "py_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) -load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -85,23 +84,6 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", - "//tensorflow/python:gradient_checker", - ], -) - -tf_cc_test( - name = "periodic_resample_op_cc_test", - size = "small", - srcs = [ - "ops/array_ops_test.cc", - ], - deps = [ - ":all_ops", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_proto", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index 514689cf45..e18923c8aa 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,9 +22,4 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); - -REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") - .Device(DEVICE_CPU), - PeriodicResampleOpGrad); - } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 42fba81a5c..3ab588c458 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,202 +25,92 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/work_sharder.h" namespace { -// Computes input tensor index for given output index during forward -// propagation through periodic_resample operation. -class InputIndexer { - public: - InputIndexer(const std::vector& output_dimensions, - const tensorflow::TensorShape& input_shape, - int adjustable_dimension) - : output_dimensions_(output_dimensions), - adjustable_dimension_(adjustable_dimension), - rank_(input_shape.dims()), - linear_output_index_(0), - linear_input_index_(0), - adjustable_dimension_carriage_sum_(0) { - auto input_dimensions = TensorShapeToVector(input_shape); - // factors by which input_dimensions increases/decreases w.r.t. 
- // output_dimensions - dimension_ceiling_ = - ComputeDimensionCeiling(output_dimensions, input_dimensions); - cumulative_dimensions_ = ComputeCumulativeDimensions(); - - output_indices_.resize(output_dimensions_.size()); - input_indices_.resize(output_dimensions_.size()); - - // Compute index_factors - index_factors_.resize(rank_); - tensorflow::int64 last_index_factor = 1; - for (auto r = rank_ - 1; r >= 0; --r) { - index_factors_[r] = last_index_factor; - last_index_factor *= input_dimensions[r]; - } - } - - tensorflow::int64 linear_input_index() const { return linear_input_index_; } - - void MoveToOutputIndex(tensorflow::int64 output_index); - void IncrementOutputIndex(); - - private: - void RecomputeInputAdjustableDimensionIndex() { - tensorflow::int64 index = adjustable_dimension_carriage_sum_; - index *= output_dimensions_[adjustable_dimension_]; - index += output_indices_[adjustable_dimension_]; - input_indices_[adjustable_dimension_] = index; - } - - std::vector TensorShapeToVector( - const tensorflow::TensorShape& tensor_shape); - - std::vector ComputeDimensionCeiling( - const std::vector& output_dimensions, - const std::vector& input_dimensions); - - std::vector ComputeCumulativeDimensions(); - - const std::vector output_dimensions_; - std::vector dimension_ceiling_; - std::vector index_factors_; - std::vector cumulative_dimensions_; - std::vector output_indices_; - std::vector input_indices_; - - const int adjustable_dimension_; - const int rank_; - tensorflow::int64 linear_output_index_; - tensorflow::int64 linear_input_index_; - tensorflow::int64 adjustable_dimension_carriage_sum_; -}; - -void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) { - linear_output_index_ = output_index; - linear_input_index_ = 0; +template +IndexT compute_input_index( + IndexVecT* target_dimensions, const IndexT& output_index, + const IndexVecT& original_dimensions, const int& adjustable_dimension, + const std::vector& dimension_ceiling, + const std::vector& cumulative_dimensions, IndexT* result, + std::vector* output_indices, const int& rank) { + *result = 0; + output_indices->clear(); // un-rasterize the output index auto last_reduced_i = output_index; - for (auto r = rank_ - 1; r >= 0; --r) { - output_indices_[r] = last_reduced_i % output_dimensions_[r]; + for (auto r = rank - 1; r >= 0; --r) { + (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r]; last_reduced_i = - (last_reduced_i - output_indices_[r]) / output_dimensions_[r]; + (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r]; } - tensorflow::int64 carriage_sum = 0; - for (int qi = 0; qi < rank_; ++qi) { - if (qi == adjustable_dimension_) continue; - carriage_sum += cumulative_dimensions_[qi] * - (output_indices_[qi] % dimension_ceiling_[qi]); - } - adjustable_dimension_carriage_sum_ = carriage_sum; - // rasterize the input index - for (auto r = rank_ - 1; r >= 0; --r) { - if (r != adjustable_dimension_) { - input_indices_[r] = output_indices_[r] / dimension_ceiling_[r]; - } else { - RecomputeInputAdjustableDimensionIndex(); - } - } - for (auto r = rank_ - 1; r >= 0; --r) { - linear_input_index_ += index_factors_[r] * input_indices_[r]; - } -} - -void InputIndexer::IncrementOutputIndex() { - linear_output_index_++; - for (auto r = rank_ - 1; r >= 0; --r) { - auto old_carriage_sum_increment = - cumulative_dimensions_[r] * - (output_indices_[r] % dimension_ceiling_[r]); - output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r]; - if (r != adjustable_dimension_) { - auto new_input_index 
= output_indices_[r] / dimension_ceiling_[r]; - linear_input_index_ += - (new_input_index - input_indices_[r]) * index_factors_[r]; - - input_indices_[r] = new_input_index; - - auto new_carriage_sum_increment = - cumulative_dimensions_[r] * - (output_indices_[r] % dimension_ceiling_[r]); - - adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ - - old_carriage_sum_increment + - new_carriage_sum_increment; - } - - if (output_indices_[r] != 0) { - // No more carries to higher indices. - break; + IndexT last_index_factor = 1; + for (auto r = rank - 1; r >= 0; --r) { + IndexT index = 0; + if (r != adjustable_dimension) + index = (*output_indices)[r] / dimension_ceiling[r]; + else { + for (int qi = 0; qi < rank; ++qi) { + if (qi == adjustable_dimension) continue; + index += cumulative_dimensions[qi] * + ((*output_indices)[qi] % dimension_ceiling[qi]); + } + index *= (*target_dimensions)[adjustable_dimension]; + index += (*output_indices)[r]; } + *result += last_index_factor * index; + last_index_factor *= original_dimensions[r]; } - auto old_adjustable_dimension_input_index = - input_indices_[adjustable_dimension_]; - RecomputeInputAdjustableDimensionIndex(); - linear_input_index_ += (input_indices_[adjustable_dimension_] - - old_adjustable_dimension_input_index) * - index_factors_[adjustable_dimension_]; -} -std::vector InputIndexer::TensorShapeToVector( - const tensorflow::TensorShape& tensor_shape) { - std::vector result(tensor_shape.dims()); - int count = 0; - for (const auto dim_info : tensor_shape) { - result[count] = dim_info.size; - ++count; - } - return result; + return *result; } -std::vector InputIndexer::ComputeDimensionCeiling( - const std::vector& output_dimensions, - const std::vector& input_dimensions) { - std::vector dimension_ceiling(input_dimensions.size()); - for (size_t i = 0; i < input_dimensions.size(); ++i) { - dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) / - input_dimensions[i]; - } - return dimension_ceiling; -} +template // both types are needed here b/c IndexVecT and + // InputDataT are not related + void + fill_periodic_tensor( + tensorflow::OpKernelContext* context, + const IndexVecT& desired_shape, + const tensorflow::Tensor& input_tensor) { + // input is a strided array (last index is fastest, C-ordered) + auto input = input_tensor.flat(); + const int rank = input_tensor.dims(); + // original and target dimensions + std::vector original_dimensions(rank), + target_dimensions(rank); + tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1); + // factors by which original_dimensions increases/decreases w.r.t. 
+ // target_dimensions + std::vector dimension_ceiling(rank), + cumulative_dimensions(rank); + // index of adjustable dimension + int adjustable_dimension; + tensorflow::TensorShape output_shape; -std::vector InputIndexer::ComputeCumulativeDimensions() { - std::vector cumulative_dimensions(rank_); - int count = 0; - for (int i = 0; i < rank_; ++i) { - if (count == 0) { - cumulative_dimensions[count] = 1; - } else { - cumulative_dimensions[count] = - cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1]; - } - ++count; - } - return cumulative_dimensions; -} + // requires that the rank of the input tensor and length of the desired shape + // are equal + OP_REQUIRES(context, rank == desired_shape.size(), + tensorflow::errors::InvalidArgument( + "periodic_resample expects the rank of the input tensor, ", + rank, ", to be the same as the length of the desired shape, ", + desired_shape.size(), ".")); -template -void process_desired_shape(tensorflow::OpKernelContext* context, - const tensorflow::TensorShape& input_tensor_shape, - const IndexVecT& desired_shape, - int* adjustable_dimension, - std::vector* target_dimensions, - tensorflow::int64* output_size) { - tensorflow::int64 new_sliced_size = 1; bool found = false; - const int rank = input_tensor_shape.dims(); + const auto& input_tensor_shape = input_tensor.shape(); + for (int i = 0; i < rank; ++i) { + // if (desired_shape(i) < 1) { if (desired_shape[i] < 1) { // only one index can be adjustable OP_REQUIRES(context, !found, tensorflow::errors::InvalidArgument( "periodic_resample expects only " "one index to be marked as adjustable.")); - *adjustable_dimension = i; + adjustable_dimension = i; found = true; } else { OP_REQUIRES( @@ -232,8 +122,9 @@ void process_desired_shape(tensorflow::OpKernelContext* context, i, " input tensor has size ", input_tensor_shape.dim_size(i), ", desired shape has size ", desired_shape[i], ".")); - (*target_dimensions)[i] = desired_shape[i]; - new_sliced_size *= (*target_dimensions)[i]; + // target_dimensions[i] = desired_shape(i); + target_dimensions[i] = desired_shape[i]; + new_sliced_size *= target_dimensions[i]; } } // at least one index needs to be adjustable @@ -241,50 +132,26 @@ void process_desired_shape(tensorflow::OpKernelContext* context, tensorflow::errors::InvalidArgument( "periodic_resample expects at least " "one index to be marked as adjustable.")); - (*target_dimensions)[*adjustable_dimension] = - input_tensor_shape.num_elements() / new_sliced_size; - - *output_size = new_sliced_size * (*target_dimensions)[*adjustable_dimension]; -} - -// Heuristic number based on measurements on -// Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz -const tensorflow::int64 costPerFillIndex = 35; -enum class Mode { - kForward, - kGradient -}; - -// Computes either periodic_resample operation output or gradients for it, -// depending on |mode|. -// |original_shape| is always shape of input to periodic_resample operation. -// |source_tensor| is either source for periodic_resample (for forward mode) -// or gradients tensor. -// |desired_shape| is always shape, provided by user, to which forward -// propagation attempts resample input tensor. 
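Both the removed `process_desired_shape` helper and the new `fill_periodic_tensor` perform the same shape bookkeeping: exactly one entry of the desired shape is marked adjustable (value < 1), the remaining entries fix `new_sliced_size`, and the adjustable entry absorbs the leftover elements; `dimension_ceiling` and `cumulative_dimensions` then drive the index arithmetic. A minimal, dependency-free Python sketch of that bookkeeping (helper name and example values are illustrative, mirroring the C++ above):

```python
import math

def resample_shape_bookkeeping(original_dims, desired_shape):
    """Sketch of the shape bookkeeping done by the periodic_resample kernel."""
    rank = len(original_dims)
    assert len(desired_shape) == rank, "input rank must match desired shape length"

    adjustable, new_sliced_size = None, 1
    target_dims = [0] * rank
    for i, d in enumerate(desired_shape):
        if d is None or d < 1:                  # marked adjustable
            assert adjustable is None, "only one adjustable dimension allowed"
            adjustable = i
        else:
            target_dims[i] = d
            new_sliced_size *= d
    assert adjustable is not None, "at least one adjustable dimension required"

    total_size = 1
    for d in original_dims:
        total_size *= d
    target_dims[adjustable] = total_size // new_sliced_size

    dimension_ceiling = [int(math.ceil(float(t) / o))
                         for t, o in zip(target_dims, original_dims)]
    cumulative_dimensions = [1] * rank
    for i in range(1, rank):
        cumulative_dimensions[i] = (cumulative_dimensions[i - 1] *
                                    dimension_ceiling[i - 1])
    return target_dims, dimension_ceiling, cumulative_dimensions

# A (2, 2, 4) input resampled to [4, 4, None] yields target shape [4, 4, 1],
# matching the shape-inference test removed further below in this patch.
print(resample_shape_bookkeeping([2, 2, 4], [4, 4, None]))
```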
-template -void -do_periodic_resample_op(tensorflow::OpKernelContext* context, - const tensorflow::TensorShape& original_shape, - const tensorflow::PartialTensorShape& desired_shape, - const tensorflow::Tensor& source_tensor) { - const int rank = source_tensor.dims(); + int count = 0; + for (const auto dim_info : input_tensor.shape()) { + original_dimensions[count] = dim_info.size; + ++count; + } - // requires that the rank of the input tensor and length of the desired shape - // are equal - OP_REQUIRES(context, rank == desired_shape.dims(), - tensorflow::errors::InvalidArgument( - "periodic_resample expects the rank of the input tensor, ", - rank, ", to be the same as the length of the desired shape, ", - desired_shape.dims(), ".")); + target_dimensions[adjustable_dimension] = total_size / new_sliced_size; - std::vector target_dimensions(rank); - tensorflow::int64 new_size = 0; - // index of adjustable dimension - int adjustable_dimension = 0; - process_desired_shape(context, original_shape, desired_shape.dim_sizes(), - &adjustable_dimension, &target_dimensions, &new_size); + count = 0; + for (int i = 0; i < input_tensor.shape().dims(); ++i) { + dimension_ceiling[count] = tensorflow::int64(std::ceil( + float(target_dimensions[count]) / float(original_dimensions[count]))); + if (count == 0) + cumulative_dimensions[count] = 1; + else + cumulative_dimensions[count] = + cumulative_dimensions[count - 1] * dimension_ceiling[count - 1]; + ++count; + } // ensure that the new dimension is greater than zero OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0, @@ -293,14 +160,11 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context, "adjustable dimension, ", adjustable_dimension, ", isn't greater than zero, ", target_dimensions[adjustable_dimension], ".")); - tensorflow::TensorShape output_shape; - if (mode == Mode::kForward) { - for (int i = 0; i < rank; ++i) { - output_shape.AddDim(target_dimensions[i]); - } - } else { - output_shape = original_shape; + for (int i = 0; i < rank; ++i) { + output_shape.AddDim(target_dimensions[i]); } + const auto new_size = + new_sliced_size * target_dimensions[adjustable_dimension]; // Create an output tensor and attach it to the current context tensorflow::Tensor* output_tensor = nullptr; @@ -308,73 +172,47 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context, context->allocate_output(0, output_shape, &output_tensor)); auto output = output_tensor->flat(); - // input is a strided array (last index is fastest, C-ordered) - auto input = source_tensor.flat(); + // memory is allocated for these variables outside the inner loop for + // efficiency (although, I could create a separate class scope for + // this purpose instead) + tensorflow::int64 result = 0; + std::vector output_indices(target_dimensions.size()); // Fill output tensor with periodically resampled input tensor values - InputIndexer input_indexer(target_dimensions, original_shape, - adjustable_dimension); - - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - auto fill_output_tensor = [&input_indexer, &output, &input]( - tensorflow::int64 start, tensorflow::int64 limit) { - InputIndexer local_indexer(input_indexer); - local_indexer.MoveToOutputIndex(start); - for (tensorflow::int64 output_index = start; output_index < limit; - ++output_index) { - if (mode == Mode::kForward) { - output(output_index) = input(local_indexer.linear_input_index()); - } else { - output(local_indexer.linear_input_index()) = input(output_index); - } - 
local_indexer.IncrementOutputIndex(); - } - }; - ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, - new_size, costPerFillIndex, fill_output_tensor); -} - -#define DATA_TYPE_SWITCH(data_type, context, CASE) \ - switch (data_type) { \ - CASE(float) \ - CASE(double) \ - CASE(tensorflow::int32) \ - CASE(tensorflow::int64) \ - default: \ - context->CtxFailure(__FILE__, __LINE__, \ - tensorflow::errors::InvalidArgument( \ - "Unsuppored tensor elements type")); \ - break; \ + for (tensorflow::int64 output_index = 0; output_index < new_size; + ++output_index) { + output(output_index) = input(compute_input_index( + &target_dimensions, output_index, original_dimensions, + adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result, + &output_indices, rank)); } +} void create_output_tensor( tensorflow::OpKernelContext* context, const tensorflow::Tensor& input_tensor, const tensorflow::DataType& input_tensor_type, - const tensorflow::PartialTensorShape& desired_shape) { -#define CASE(type) \ - case tensorflow::DataTypeToEnum::value: \ - do_periodic_resample_op( \ - context, input_tensor.shape(), desired_shape, input_tensor); \ - break; + const tensorflow::PartialTensorShape& desired_shape_tensor) { + auto desired_shape = desired_shape_tensor.dim_sizes(); - DATA_TYPE_SWITCH(input_tensor_type, context, CASE); -#undef CASE -} - -void create_grad_tensor(tensorflow::OpKernelContext* context, - const tensorflow::Tensor& grad_tensor, - const tensorflow::DataType& grad_tensor_type, - const tensorflow::TensorShape& original_shape, - const tensorflow::PartialTensorShape& desired_shape) { -#define CASE(type) \ - case tensorflow::DataTypeToEnum::value: \ - do_periodic_resample_op( \ - context, original_shape, desired_shape, grad_tensor); \ + // obligatory type switch + switch (input_tensor_type) { + case tensorflow::DataTypeToEnum::value: + fill_periodic_tensor(context, desired_shape, input_tensor); break; - - DATA_TYPE_SWITCH(grad_tensor_type, context, CASE); -#undef CASE + case tensorflow::DataTypeToEnum::value: + fill_periodic_tensor(context, desired_shape, input_tensor); + break; + case tensorflow::DataTypeToEnum::value: + fill_periodic_tensor(context, desired_shape, + input_tensor); + break; + case tensorflow::DataTypeToEnum::value: + fill_periodic_tensor(context, desired_shape, + input_tensor); + break; + default:; + } } } // namespace @@ -400,25 +238,4 @@ class PeriodicResampleOp : public tensorflow::OpKernel { tensorflow::PartialTensorShape desired_shape; }; -class PeriodicResampleOpGrad : public tensorflow::OpKernel { - public: - explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context) - : tensorflow::OpKernel(context) { - OP_REQUIRES_OK(context, - context->GetAttr("original_shape", &original_shape)); - OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape)); - } - - void Compute(tensorflow::OpKernelContext* context) override { - const tensorflow::Tensor& grad_tensor = context->input(0); - const tensorflow::DataType grad_tensor_type = context->input_dtype(0); - create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape, - desired_shape); - } - - private: - tensorflow::TensorShape original_shape; - tensorflow::PartialTensorShape desired_shape; -}; - #endif // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_ diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc index fd38cd09b4..82bd796956 100644 --- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc +++ 
b/tensorflow/contrib/periodic_resample/ops/array_ops.cc @@ -26,42 +26,7 @@ REGISTER_OP("PeriodicResample") .Input("values: T") .Attr("shape: shape") .Output("output: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - tensorflow::PartialTensorShape desired_shape; - TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape)); - shape_inference::ShapeHandle input_tensor_shape = c->input(0); - shape_inference::DimensionHandle num_input_elements = - c->NumElements(input_tensor_shape); - shape_inference::ShapeHandle result_shape_handle; - if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) { - TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( - desired_shape, &result_shape_handle)); - } else { - const int rank = c->Rank(input_tensor_shape); - std::vector target_dimensions(rank); - tensorflow::int64 new_sliced_size = 1; - int adjustable_dimension = 0; - for (int i = 0; i < rank; ++i) { - if (desired_shape.dim_size(i) < 1) { - adjustable_dimension = i; - } else { - target_dimensions[i] = desired_shape.dim_size(i); - new_sliced_size *= target_dimensions[i]; - } - } - target_dimensions[adjustable_dimension] = - shape_inference::InferenceContext::Value( - num_input_elements) / new_sliced_size; - tensorflow::TensorShape result_shape; - for (int i = 0; i < rank; ++i) { - result_shape.AddDim(target_dimensions[i]); - } - TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape( - result_shape, &result_shape_handle)); - } - c->set_output(0, result_shape_handle); - return Status::OK(); - }) + .SetShapeFn(shape_inference::ExplicitShape) .Doc(R"doc( Periodically resample elements of a tensor to conform to `shape`. @@ -136,20 +101,4 @@ output: Periodically resampled tensor that has dimensions specified as in )doc"); - -REGISTER_OP("PeriodicResampleOpGrad") - .Attr("T: numbertype") - .Input("grad: T") - .Attr("original_shape: shape") - .Attr("desired_shape: shape") - .Output("grad_values: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - tensorflow::TensorShape original_shape; - TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape)); - shape_inference::ShapeHandle s; - TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s)); - c->set_output(0, s); - return Status::OK(); -}); - } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc deleted file mode 100644 index 43b7c1799f..0000000000 --- a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/shape_inference_testutil.h" -#include "tensorflow/core/framework/tensor_shape.pb.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { - -TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { - ShapeInferenceTestOp op("PeriodicResample"); - // Case 1: output shape can be fully inferreed. - PartialTensorShape shape({4, 4, -1}); - TensorShapeProto shape_proto; - shape.AsProto(&shape_proto); - - TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") - .Input({"values", 0, DT_INT32}) - .Attr("shape", shape_proto) - .Finalize(&op.node_def)); - INFER_OK(op, "[2,2,4]", "[4,4,1]"); - // Case 2: output shape can not be inferred - report desired shape. - INFER_OK(op, "[2,2,?]", "[4,4,?]"); -} - -} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index 31a6fe1d94..a25de55e18 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,11 +21,8 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample -from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -96,6 +93,7 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): + variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -105,29 +103,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() - def testPeriodicResampleGradient(self): - desired_shape = numpy.array([4, 4, None]) - result_shape = (4, 4, 1) - input_shape = (2, 2, 4) - with self.test_session() as sess: - x = array_ops.placeholder(dtypes.float32, shape=input_shape) - output = periodic_resample(x, desired_shape) - error = gradient_checker.compute_gradient_error( - x, input_shape, output, result_shape) - self.assertLess(error, 1e-4) - - def testPeriodicResampleShapeInference(self): - with self.test_session() as sess: - # Case 1: output shape can be fully inferreed. - x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) - output = periodic_resample(x, [4, 4, None]) - self.assertEqual(output.shape, [4, 4, 1]) - # Case 2: output shape can not be inferred - report desired shape. 
- x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) - output = periodic_resample(x, [4, 4, None]) - self.assertTrue(output.shape.is_compatible_with([4, 4, None])) - self.assertEqual(output.shape[2].value, None) - if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 470e300ccb..348623d8f8 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,17 +21,11 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample from tensorflow.contrib.util import loader -from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) - -@ops.RegisterGradient("PeriodicResample") -def _periodic_resample_grad_cc(op, grad): - return periodic_resample_op_grad( - grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index af3b2ad1b5..b7a98c68e2 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,8 +34,7 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None, - config=None): + graph=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -49,7 +48,6 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -60,7 +58,6 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( - config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index a725072e72..d78d94c269 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,8 +51,7 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None, - config=None): + graph=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -63,7 +62,6 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. 
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -73,7 +71,6 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( - config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index f275bc15ad..6e77e934fe 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,8 +30,7 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -45,7 +44,6 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -64,15 +62,13 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph, - config=config) + graph=graph) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -83,7 +79,6 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -98,19 +93,14 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, - serving_input_receiver_fn, - output_key=output_key, - graph=graph, - config=config) + estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -125,7 +115,6 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -139,5 +128,4 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph, - config=config) + graph=graph) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index a2ef1dc3af..578d9424b2 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,7 +20,6 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common -from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -42,11 +41,6 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') - def testFromSavedModelWithSessionConfig(self): - """Test loading from_saved_model with session config.""" - predictor_factories.from_saved_model( - self._export_dir, config=config_pb2.ConfigProto()) - def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -59,13 +53,6 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') - def testFromContribEstimatorWithSessionConfig(self): - estimator = testing_common.get_arithmetic_estimator(core=False) - input_fn = testing_common.get_arithmetic_input_fn(core=False) - predictor_factories.from_contrib_estimator( - estimator, input_fn, output_alternative_key='sum', - config=config_pb2.ConfigProto()) - def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -77,12 +64,6 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) - def testFromCoreEstimatorWithSessionConfig(self): - estimator = testing_common.get_arithmetic_estimator(core=True) - input_fn = testing_common.get_arithmetic_input_fn(core=True) - predictor_factories.from_estimator( - estimator, input_fn, config=config_pb2.ConfigProto()) - def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 95da6d04ed..0dbca0f813 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,8 +121,7 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None, - config=None): + graph=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -143,7 +142,6 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
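Because `SavedModelPredictor` now builds its session without a caller-supplied `ConfigProto` (see the hunk that follows), code that still needs custom session options can load the SavedModel itself, mirroring the predictor's internal `loader.load` call. This is a workaround sketch, not part of this patch; the path, tag, and tensor names are placeholders:

```python
import tensorflow as tf
from tensorflow.python.saved_model import loader

export_dir = '/tmp/my_saved_model'                    # placeholder
config = tf.ConfigProto(allow_soft_placement=True)    # caller-chosen options

graph = tf.Graph()
with graph.as_default():
    sess = tf.Session(config=config)
    loader.load(sess, ['serve'], export_dir)          # same call the predictor makes
    # Look up input/output tensors from the loaded signature, e.g.:
    # y = graph.get_tensor_by_name('y:0')
    # print(sess.run(y, feed_dict={'x:0': [[1.0]]}))
```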
@@ -154,7 +152,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session(config=config) + self._session = session.Session() loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index 27a933c0f9..c83623ec94 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). +[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 3d0308aaf3..94fc12ca81 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,6 +26,7 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib +from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -36,7 +37,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,10 +136,9 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metrics.accuracy( - labels=labels, predictions=predictions) - accuracy1, update_op1 = metrics.accuracy( - labels=labels, predictions=predictions + 1) + accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) + accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, + labels) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -199,8 +198,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metrics.accuracy( - labels=labels_limited, predictions=predictions_limited) + value_op, update_op = metric_ops.streaming_accuracy( + 
predictions_limited, labels_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -261,8 +260,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -277,8 +276,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index d22b80ac88..99ced53e11 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,7 +21,6 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: -```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -31,11 +30,9 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... -``` To use it with graph execution, write your code as follows: -```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -56,7 +53,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
-``` + """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index d8236a0a6f..e893e1d1c8 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key +from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,13 +38,12 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metrics.mean(nn.in_top_k(probabilities, targets, k)) + return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metrics.accuracy( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_accuracy(predictions, targets, weights=weights) def _r2(probabilities, targets, weights=None): @@ -54,7 +53,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metrics.mean(score, weights=weights) + return metric_ops.streaming_mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -63,7 +62,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -72,7 +71,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -83,7 +82,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -91,36 +90,34 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metrics.precision( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_precision(predictions, targets, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metrics.precision_at_thresholds( - labels=targets, - predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), - thresholds=np.arange(0, 1, 0.01, dtype=np.float32), + return metric_ops.streaming_precision_at_thresholds( + array_ops.slice(predictions, [0, 1], [-1, 1]), + targets, + np.arange( + 0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metrics.recall( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_recall(predictions, targets, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metrics.recall_at_thresholds( - labels=targets, - predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), - thresholds=np.arange(0, 1, 0.01, dtype=np.float32), + return metric_ops.streaming_recall_at_thresholds( + array_ops.slice(predictions, [0, 1], [-1, 1]), + targets, + np.arange( + 0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metrics.auc( - labels=targets, - predictions=array_ops.slice(probs, [0, 1], [-1, 1]), - weights=weights) + return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), + targets, weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 6f62cd11a9..7a35a70bbe 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeVariables(object): +class TreeTrainingVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): + def __init__(self, params, tree_num, training): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,28 +315,27 @@ class TreeVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, tree_stat, self.get_tree_name('stats', tree_num)) + params, '', self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) + params, '', self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestVariables(object): +class ForestTrainingVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeVariables object for each tree. We override the + Instantiates a TreeTrainingVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestVariables(params) + forest_variables = ForestTrainingVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeVariables, - tree_configs=None, tree_stats=None): + tree_variables_class=TreeTrainingVariables): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
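After the rename back to `TreeTrainingVariables`/`ForestTrainingVariables` and the removal of the `tree_configs`/`tree_stats` keywords, a forest is built from hyperparameters alone; its variables start empty and are filled by training or by restoring a checkpoint. A usage sketch reusing the hyperparameter and input values from the restored-model test removed further below in this patch:

```python
from tensorflow.contrib.tensor_forest.python import tensor_forest

hparams = tensor_forest.ForestHParams(
    num_classes=2,
    num_features=2,
    num_trees=1,
    max_nodes=1000,
    split_after_samples=25).fill()

# No tree_configs/tree_stats arguments after this change.
graph_builder = tensor_forest.RandomForestGraphs(hparams)

input_data = [[-1., 0.], [-1., 2.], [1., 0.], [1., -2.]]
probs, paths, var = graph_builder.inference_graph(input_data)
```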
@@ -348,13 +347,7 @@ class ForestVariables(object): for i in range(params.num_trees): with ops.device(self.device_dummies[i].device): - kwargs = {} - if tree_configs is not None: - kwargs.update(dict(tree_config=tree_configs[i])) - if tree_stats is not None: - kwargs.update(dict(tree_stat=tree_stats[i])) - self.variables.append(tree_variables_class( - params, i, training, **kwargs)) + self.variables.append(tree_variables_class(params, i, training)) def __setitem__(self, t, val): self.variables[t] = val @@ -368,11 +361,9 @@ class RandomForestGraphs(object): def __init__(self, params, - tree_configs=None, - tree_stats=None, device_assigner=None, variables=None, - tree_variables_class=TreeVariables, + tree_variables_class=TreeTrainingVariables, tree_graphs=None, training=True): self.params = params @@ -380,10 +371,9 @@ class RandomForestGraphs(object): device_assigner or framework_variables.VariableDeviceChooser()) logging.info('Constructing forest with params = ') logging.info(self.params.__dict__) - self.variables = variables or ForestVariables( + self.variables = variables or ForestTrainingVariables( self.params, device_assigner=self.device_assigner, training=training, - tree_variables_class=tree_variables_class, - tree_configs=tree_configs, tree_stats=tree_stats) + tree_variables_class=tree_variables_class) tree_graph_class = tree_graphs or RandomTreeGraphs self.trees = [ tree_graph_class(self.variables[i], self.params, i) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index 1c9c81827e..bbe627b157 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -18,14 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from google.protobuf.json_format import ParseDict -from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.ops import resources -from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -114,47 +110,6 @@ class TensorForestTest(test_util.TensorFlowTestCase): self.assertTrue(isinstance(paths, ops.Tensor)) self.assertTrue(isinstance(var, ops.Tensor)) - def testInfrenceFromRestoredModel(self): - input_data = [[-1., 0.], [-1., 2.], # node 1 - [1., 0.], [1., -2.]] # node 2 - expected_prediction = [[0.0, 1.0], [0.0, 1.0], - [0.0, 1.0], [0.0, 1.0]] - hparams = tensor_forest.ForestHParams( - num_classes=2, - num_features=2, - num_trees=1, - max_nodes=1000, - split_after_samples=25).fill() - tree_weight = {'decisionTree': - {'nodes': - [{'binaryNode': - {'rightChildId': 2, - 'leftChildId': 1, - 'inequalityLeftChildTest': - {'featureId': {'id': '0'}, - 'threshold': {'floatValue': 0}}}}, - {'leaf': {'vector': - {'value': [{'floatValue': 0.0}, - {'floatValue': 1.0}]}}, - 'nodeId': 1}, - {'leaf': {'vector': - {'value': [{'floatValue': 0.0}, - {'floatValue': 1.0}]}}, - 'nodeId': 2}]}} - restored_tree_param = ParseDict(tree_weight, - _tree_proto.Model()).SerializeToString() - graph_builder = tensor_forest.RandomForestGraphs(hparams, - [restored_tree_param]) - probs, paths, var = graph_builder.inference_graph(input_data) - 
self.assertTrue(isinstance(probs, ops.Tensor)) - self.assertTrue(isinstance(paths, ops.Tensor)) - self.assertTrue(isinstance(var, ops.Tensor)) - with self.test_session(): - variables.global_variables_initializer().run() - resources.initialize_resources(resources.shared_resources()).run() - self.assertEquals(probs.eval().shape, (4, 2)) - self.assertEquals(probs.eval().tolist(), expected_prediction) - def testTrainingConstructionClassificationSparse(self): input_data = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index da4dd5a14c..b7b26cfb1c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -91,11 +91,8 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, if (!subgraph_node_ids.count(edge->src()->id()) && !edge->src()->IsSource() && !edge->IsControlEdge()) { incoming_edges->insert(edge); - VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() - << " Y, "; } else { - VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() - << " N, "; + VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, "; } } } @@ -109,12 +106,10 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, for (const tensorflow::Edge* edge : node->out_edges()) { if (!subgraph_node_ids.count(edge->dst()->id()) && !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() - << " Y, "; + VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, "; outgoing_edges->insert(edge); } else { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() - << " N, "; + VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, "; } } } @@ -186,27 +181,29 @@ struct ConvertGraphParams { static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_incoming_edges); - - std::set> unique_tensors; - // Add only unique input source nodes. 
If output of an outside node is shared - // between multiple nodes inside the engine, only one edge should be created for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); + p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()}); + } + auto output_name_to_index_map = BuildTensorNameMap(p->output_names); + std::set> subgraph_outputs_set; + // Collect outputs referenced from output_names + for (int node_id : p->subgraph_node_ids) { + tensorflow::Node* node = p->graph.FindNodeId(node_id); + if (output_name_to_index_map.count(node->name())) { + for (int index : output_name_to_index_map.at(node->name())) { + subgraph_outputs_set.insert({node_id, index}); + } + } } - p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), - unique_tensors.end()); GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_outgoing_edges); - unique_tensors.clear(); - // Similar to above, if multiple ouside nodes are sharing the output of an - // internal node only one output port should be created and shared between - // outputs for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); + subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()}); } - p->subgraph_outputs.reserve(unique_tensors.size()); + p->subgraph_outputs.reserve(subgraph_outputs_set.size()); p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - unique_tensors.begin(), unique_tensors.end()); + subgraph_outputs_set.begin(), + subgraph_outputs_set.end()); return tensorflow::Status::OK(); } @@ -228,6 +225,7 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) { for (auto in_edge : params->subgraph_incoming_edges) { // loop over incoming edges and // attach them to calib node + // tensorflow::Node* src_node = in_edge->src(); auto src_output = in_edge->src_output(); auto dst_node = in_edge->dst(); auto dst_input = in_edge->dst_input(); @@ -259,24 +257,19 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); } - std::set> unique_tensors; for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { std::pair old_src = {edge->src()->id(), edge->src_output()}; - if (unique_tensors.count(old_src)) continue; - unique_tensors.insert(old_src); int new_src_output = subgraph_edge_to_input_map.at(old_src); params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, new_src_output); - VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() - << " -> " << trt_node->name() << ":" << new_src_output; params->graph.RemoveEdge(edge); } - if (VLOG_IS_ON(2)) { - VLOG(2) << "new edge count: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); - } + + VLOG(2) << "new wiring edges: " << trt_node->in_edges().size(); + for (const tensorflow::Edge* edge : trt_node->in_edges()) { + VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); } + TF_RETURN_IF_ERROR(status); // Re-map outgoing edges to use the new TRT node instead of the orig subgraph @@ -290,8 +283,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { int new_src_output = subgraph_edge_to_output_map.at(old_src); TF_RETURN_IF_ERROR(params->graph.UpdateEdge( trt_node, 
new_src_output, edge->dst(), edge->dst_input())); - VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " - << edge->dst()->name() << ":" << edge->dst_input(); } // Remove the original subgraph for (int node_id : params->subgraph_node_ids) { @@ -326,12 +317,9 @@ tensorflow::Status ConvertCalibGraphToInferGraph( tensorflow::GraphConstructorOptions(), graph_def, &graph)); // get calib nodes std::vector calib_nodes; - std::vector topo_order; - tensorflow::GetPostOrder(graph, &topo_order); - for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { - auto node = *rit; + for (auto node : graph.op_nodes()) { if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node " << node->name(); + VLOG(1) << "Found Calib Node"; calib_nodes.push_back(node); } } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 4e4d295538..96e0700862 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -362,11 +362,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2( - {k, c}, static_cast(iweights.GetValues()), - istrides, - static_cast(const_cast(oweights->GetValues())), - ostrides); + Reorder2({k, c}, static_cast(iweights.GetValues()), + istrides, static_cast( + const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -1180,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented( - "binary op: " + node_def.op() + - " not supported at: " + node_def.name()); + return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + + " not supported at: " + + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -2139,7 +2138,9 @@ void Converter::register_op_converters() { } } // namespace - +tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { + return tensorflow::errors::Unimplemented("Not implemented yet"); +} tensorflow::Status ConvertCalibrationNodeToEngineNode( tensorflow::Graph& graph, tensorflow::Node* c_node) { const auto ndef = c_node->def(); @@ -2163,23 +2164,9 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( for (auto n : graph.op_nodes()) { node_maps.insert({n->name(), n}); } - std::set subgraph_ids; - for (const auto internal_node : segment_nodes) { - subgraph_ids.insert(node_maps.at(internal_node)->id()); - } - if (VLOG_IS_ON(2)) { - string node_names = StrCat(c_node->name(), " segment nodes= "); - - for (const auto& node_name : segment_nodes) { - StrAppend(&node_names, node_name, ", "); - } - VLOG(2) << node_names; - } - VLOG(1) << "Output Nodes:"; std::vector out_types; std::vector out_edges; - for (auto& i : output_nodes) { auto node_port = tensorflow::str_util::Split(i, ":"); VLOG(1) << " " << i << " in graph " << node_maps.count(i); @@ -2199,24 +2186,18 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( out_types.push_back(out_node->output_type(0)); } for (auto out_edge : out_node->out_edges()) { - if (subgraph_ids.count(out_edge->dst()->id())) - continue; // skip internal edges; if (out_edge->src_output() == port) { out_edges.push_back(out_edge); - VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" - << out_edge->src_output() << " -> " << out_edge->dst()->name() - << ":" << 
out_edge->dst_input(); + break; } } } else { LOG(WARNING) << " couldn't find output node " << out_node_name; } } - if (VLOG_IS_ON(1)) { - VLOG(1) << c_node->name() << " Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); - } + VLOG(1) << "Input Nodes:"; + for (auto& i : input_names) { + VLOG(1) << " " << i << " in graph " << node_maps.count(i); } auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); auto resmgr = trt_rm->getManager("TRTCalibOps"); @@ -2250,24 +2231,14 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( calib_res->builder_ = nullptr; tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector income_edges; - income_edges.resize(c_node->num_inputs()); for (const auto in_edge : c_node->in_edges()) { auto src = in_edge->src(); int dest_port = in_edge->dst_input(); - VLOG(1) << "Incoming connection " << src->name() << ":" - << in_edge->src_output() << " -> " << c_node->name() << ":" - << dest_port; - income_edges.at(dest_port) = {src->name(), in_edge->src_output(), - c_node->input_type(dest_port)}; + income_edges.emplace_back(src->name(), in_edge->src_output(), + c_node->input_type(dest_port)); } tensorflow::gtl::ArraySlice input_list( income_edges); - if (VLOG_IS_ON(2)) { - for (const auto& inp : input_list) { - VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " - << tensorflow::DataTypeString(inp.data_type); - } - } op_builder.Input(input_list); tensorflow::NodeDef engine_node; const char* engine_plan_data = static_cast(engine_plan->data()); @@ -2284,26 +2255,13 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( } auto trt_engine_node = graph.AddNode(engine_node, &status); TF_RETURN_IF_ERROR(status); - std::map port_map; - for (size_t t = 0; t < output_nodes.size(); t++) { - port_map.insert({output_nodes.at(t), t}); - } - for (auto& i : out_edges) { - string s(i->src()->name()); - if (i->src_output()) StrAppend(&s, ":", i->src_output()); - int out_port = port_map.at(s); - VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port - << " -> " << i->dst()->name() << ":" << i->dst_input(); - TF_RETURN_IF_ERROR( - graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); - } - for (const auto ed : trt_engine_node->in_edges()) { - VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); - } - for (const auto ed : trt_engine_node->out_edges()) { - VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); + for (size_t i = 0; i < out_edges.size(); i++) { + VLOG(1) << "Connecting trt_engine_node output " << i << " with " + << out_edges.at(i)->dst()->name() << " port " + << out_edges.at(i)->dst_input(); + TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i, + out_edges.at(i)->dst(), + out_edges.at(i)->dst_input())); } VLOG(1) << "Segment nodes:"; for (auto& i : segment_nodes) { @@ -2374,7 +2332,6 @@ tensorflow::Status ConvertSubgraph( std::vector* output_names, std::vector* output_dtypes, const string& engine_name) { - std::set added_tensors; for (const std::pair& input : s.input_inds) { VLOG(2) << "parsing input. 
Node id= " << input.first; int node_id = input.first; @@ -2417,6 +2374,7 @@ tensorflow::Status ConvertSubgraph( auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); + input_dtypes->push_back(tf_dtype); nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); auto type_status = ConvertDType(tf_dtype, &dtype); @@ -2452,10 +2410,8 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) { input_tensor_name = StrCat(node_name, ":", output_idx); } - if (added_tensors.count(input_tensor_name)) continue; - added_tensors.insert(input_tensor_name); + input_names->push_back(input_tensor_name); - input_dtypes->push_back(tf_dtype); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); @@ -2479,7 +2435,6 @@ tensorflow::Status ConvertSubgraph( // Gather output metadata int trt_engine_op_output_idx = 0; - added_tensors.clear(); for (const std::pair& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; @@ -2496,8 +2451,6 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); VLOG(2) << "Output tensor name: " << tensor_name; - if (added_tensors.count(tensor_name)) continue; - added_tensors.insert(tensor_name); output_names->push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); if (!tensor_or_weights.is_tensor()) { diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index d879170b68..2e472a2805 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -166,21 +166,11 @@ def StreamingFilesDataset(files, return remote_iterator.get_next() def MapFn(unused_input): - if isinstance(source_dataset.output_types, dtypes.DType): - output_types = [source_dataset.output_types] - elif isinstance(source_dataset.output_types, (list, tuple)): - output_types = source_dataset.output_types - else: - raise ValueError('source dataset has invalid output types') - remote_calls = functional_ops.remote_call( + return functional_ops.remote_call( args=[source_handle], - Tout=output_types, + Tout=[dtypes.string], f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) - if len(remote_calls) == 1: - return remote_calls[0] - else: - return remote_calls + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index b58d05eac5..918cf0ed8e 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -26,8 +26,6 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import python_io from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -164,30 +162,6 @@ class DatasetsTest(test.TestCase): self.assertEqual(set(all_contents), set(retrieved_values)) - def testArbitraryReaderFuncFromDatasetGenerator(self): - - def my_generator(): - yield (1, [1] * 10) - - def 
gen_dataset(dummy): - return dataset_ops.Dataset.from_generator( - my_generator, (dtypes.int64, dtypes.int64), - (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10]))) - - dataset = datasets.StreamingFilesDataset( - dataset_ops.Dataset.range(10), filetype=gen_dataset) - - iterator = dataset.make_initializable_iterator() - self._sess.run(iterator.initializer) - get_next = iterator.get_next() - - retrieved_values = self._sess.run(get_next) - - self.assertIsInstance(retrieved_values, (list, tuple)) - self.assertEqual(len(retrieved_values), 2) - self.assertEqual(retrieved_values[0], 1) - self.assertItemsEqual(retrieved_values[1], [1] * 10) - def testUnexpectedFiletypeString(self): with self.assertRaises(ValueError): datasets.StreamingFilesDataset( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b1c224a345..d89633199d 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -699,9 +699,7 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ - ":abi", ":lib_platform", - ":stacktrace", ], ) @@ -3091,8 +3089,6 @@ cc_library( # we now need at least "str_util". ":lib", ":lib_platform", - ":stacktrace_handler", - ":test_lite", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, @@ -3573,10 +3569,7 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = [ - "common_runtime/mkl_cpu_allocator_test.cc", - "common_runtime/mkl_threadpool_device_test.cc", - ], + srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt index 985f09312f..cbe76de415 100644 --- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt @@ -4,10 +4,6 @@ op { description: < 0`, limit of the split of the result. -END - } - summary: "Split elements of `source` based on `sep` into a `SparseTensor`." - description: <2<><>3"` and -sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty -string, consecutive whitespace are regarded as a single separator, and the -result will contain no empty strings at the startor end if the string has -leading or trailing whitespace. - -Note that the above mentioned behavior matches python's str.split. -END -} diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt deleted file mode 100644 index 0e8576fb01..0000000000 --- a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt +++ /dev/null @@ -1,4 +0,0 @@ -op { - graph_op_name: "StringSplitV2" - visibility: HIDDEN -} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 9cda17867b..8f2a419756 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { +bool BFCAllocator::Extend(size_t rounded_bytes) { size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. 
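The Extend() path above rounds the remaining memory budget down to a multiple of kMinAllocationSize, tries to allocate the planned region size, and then "backpedals" geometrically until either an allocation succeeds or the region would no longer cover the request. A minimal Python sketch of that strategy follows; it is an illustration only, not the BFCAllocator implementation, and try_alloc, min_allocation and backpedal_factor are hypothetical stand-ins for the suballocator call and the kMinAllocationSize/kBackpedalFactor constants.

    def round_down(n, multiple):
        # Round n down to the nearest multiple, as done for available_bytes above.
        return (n // multiple) * multiple

    def extend(requested_bytes, planned_region_bytes, available_bytes, try_alloc,
               min_allocation=256, backpedal_factor=0.9):
        # Sketch of extend-with-backpedal: start from the planned region size and
        # shrink geometrically on failure until the request no longer fits.
        available_bytes = round_down(available_bytes, min_allocation)
        if available_bytes < requested_bytes:
            return None
        size = min(planned_region_bytes, available_bytes)
        mem = try_alloc(size)
        while mem is None:
            size = round_down(int(size * backpedal_factor), min_allocation)
            if size < requested_bytes:
                return None  # give up: even the shrunken region cannot hold the request
            mem = try_alloc(size)
        return mem

    # Toy suballocator that can only hand out regions of up to 1 MiB.
    assert extend(64 * 1024, 4 << 20, 8 << 20,
                  lambda b: b if b <= (1 << 20) else None) is not None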
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = suballocator_->Alloc(alignment, bytes); + void* mem_addr = suballocator_->Alloc(32, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = suballocator_->Alloc(alignment, bytes); + mem_addr = suballocator_->Alloc(32, bytes); } } @@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } // Try to extend - if (Extend(unused_alignment, rounded_bytes)) { + if (Extend(rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 52aedb1e9c..ba5a3eea3a 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -305,8 +305,7 @@ class BFCAllocator : public VisitableAllocator { // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. - bool Extend(size_t alignment, size_t rounded_bytes) - EXCLUSIVE_LOCKS_REQUIRED(lock_); + bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 9028e6298c..c21a1ea9f2 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -102,25 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { -#ifdef INTEL_MKL - // if MKL is used, it goes through various additional - // graph rewrite pass. In TF, everytime a graph pass - // happens, "constant" nodes are allocated - // and deallocated. Each allocation calls the - // (FindChunkPtr of BFCAllocator), - // which increments the value of AllocationId. - // Thus AllocationId becomes more than 3 and 4 if - // MKL is used. Now they are 9 and 10 for MKL. - EXPECT_EQ(19, cm->AllocationId(node, 0)); -#else EXPECT_EQ(21, cm->AllocationId(node, 0)); -#endif } else { -#ifdef INTEL_MKL - EXPECT_EQ(20, cm->AllocationId(node, 0)); -#else EXPECT_EQ(22, cm->AllocationId(node, 0)); -#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc deleted file mode 100644 index 5d583a8360..0000000000 --- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef INTEL_MKL - -#include "tensorflow/core/common_runtime/threadpool_device.h" - -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/cpu_info.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/public/session_options.h" - -namespace tensorflow { - -#ifdef _OPENMP -TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { - SessionOptions options; - unsetenv("OMP_NUM_THREADS"); - - ThreadPoolDevice* tp = new ThreadPoolDevice( - options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); - - const int ht = port::NumHyperthreadsPerCore(); - EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); -} - -TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { - SessionOptions options; - setenv("OMP_NUM_THREADS", "314", 1); - - ThreadPoolDevice* tp = new ThreadPoolDevice( - options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); - - EXPECT_EQ(omp_get_max_threads(), 314); -} -#endif // _OPENMP - -} // namespace tensorflow - -#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index a5d31b75c7..21912236d0 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,10 +16,8 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL -#ifdef _OPENMP #include -#endif // _OPENMP -#endif // INTEL_MKL +#endif #include #include "tensorflow/core/lib/core/threadpool.h" @@ -59,10 +57,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - int mkl_intra_op = 1; -#ifdef _OPENMP - mkl_intra_op = omp_get_max_threads(); -#endif // _OPENMP + const int mkl_intra_op = omp_get_max_threads(); CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -73,7 +68,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif // INTEL_MKL +#endif } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 74a87215e1..f7a07fe503 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,11 +31,7 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL -#ifdef _OPENMP -#include -#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" -#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -47,26 +43,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { -#ifdef INTEL_MKL -#ifdef _OPENMP - const char* user_omp_threads = getenv("OMP_NUM_THREADS"); - if (user_omp_threads == nullptr) { - // OMP_NUM_THREADS controls MKL's intra-op parallelization - // Default to available physical cores - const int mkl_intra_op = port::NumSchedulableCPUs(); - const int ht = port::NumHyperthreadsPerCore(); - omp_set_num_threads((mkl_intra_op + ht - 1) / ht); - } else { - uint64 user_val = 0; - if (strings::safe_strtou64(user_omp_threads, &user_val)) { - // Superflous but triggers OpenMP loading - omp_set_num_threads(user_val); - } - } -#endif // _OPENMP -#endif // INTEL_MKL -} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc index 770a0fcf14..1cea1b1462 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc @@ -147,9 +147,7 @@ MasterService::Stub::Stub( } MasterService::AsyncService::AsyncService() { - int method_len = sizeof(grpcMasterService_method_names) / - sizeof(grpcMasterService_method_names[0]); - for (int i = 0; i < method_len; ++i) { + for (int i = 0; i < 10; ++i) { AddMethod(new ::grpc::internal::RpcServiceMethod( grpcMasterService_method_names[i], ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc index a8508d2d4f..89f83f9f24 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc @@ -17,7 +17,6 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -51,14 +50,9 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n, } for (int i = 0; i < n; ++i) { - string server_file = - strings::StrCat(testing::TensorFlowSrcRoot(), - "/core/distributed_runtime/rpc/grpc_testlib_server"); - if (!options.env->FileExists(server_file).ok()) { - return errors::Internal("Could not find grpc_testlib_server"); - } const std::vector argv( - {server_file, + {strings::StrCat(testing::TensorFlowSrcRoot(), + "/core/distributed_runtime/rpc/grpc_testlib_server"), /* see grpc_testlib_server.cc for flags */ tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i), strings::StrCat("--num_cpus=", num_cpus), diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2bb4d32d57..2c87156dca 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,8 +67,13 @@ struct AllocatorStats { // device memory. 
class Allocator { public: +#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; +#else + // Align to 32 byte boundary. + static constexpr size_t kAllocatorAlignment = 32; +#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 4b56d807df..3d7920a6e2 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" -#include #include #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index 10072724d2..eb689ec1e6 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -5,7 +5,7 @@ option cc_enable_arenas = true; option java_outer_classname = "RemoteFusedGraphExecuteInfoProto"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; -option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; +//add go_package externally import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/types.proto"; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index 80e168df97..b613effd18 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte +// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires -// the caller to ensure its result is aligned if the caller intends -// to use those methods. In this test case, we simply make sure each -// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. +// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure +// its result is aligned if the caller intends to use those methods. +// In this test case, we simply make sure each slice is 32-byte +// aligned: sizeof(float) * 4 * 2 = 32. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat().setConstant(i * 1.f); } // A simple slice along dim0. 
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); auto tx = x.tensor(); auto ty = y.tensor(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); x.flat().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index b9667998d6..72a13d4da7 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized + // path. The unoptimized path is slow. Thus we dont rewrite the node + // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKl DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN "; + << "for LRN " ; return false; } @@ -3015,35 +3015,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector* ws_tensors, bool* are_ws_tensors_added); - // Helper function used by FixMklMetaDataEdges. 
Fixes the metadata edge - // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph - // 'g'. Returns true is fixup was done; otherwise, it returns false. - bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, - const Edge* e_data, const Edge* e_metadata); - - // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly - // connected? If not, then fix them. This is needed because a graph may have - // some input Mkl metadata edges incorrectly setup after node merge and - // rewrite passes. This could happen because GetReversePostOrder function may - // not provide topologically sorted order if a graph contains cycles. The - // function returns true if at least one Mkl metadata edge for node 'n' was - // fixed. Otherwise, it returns false. - // - // Example: - // - // X = MklConv2D(_, _, _) - // Y = MklConv2DWithBias(_, _, _, _, _, _) - // Z = MklAdd(X, Y, DummyMklTensor, Y:1) - // - // For a graph such as shown above, note that 3rd argument of MklAdd contains - // DummyMklTensor. Actually, it should be getting the Mkl metadata from - // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible - // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X - // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl - // metadata edges only - it does not rewrite nodes nor does it modify the Mkl - // data edges (1st and 2nd arguments of MklAdd). - bool FixMklMetaDataEdges(std::unique_ptr* g, Node* n); - // Functions specific to operators to copy attributes // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. @@ -4270,92 +4241,6 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } -/////////////////////////////////////////////////////////////////////////////// -// Post-rewrite Mkl metadata fixup pass -/////////////////////////////////////////////////////////////////////////////// -bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, - const Edge* e_data, const Edge* e_metadata) { - if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { - return false; - } - - Node* n_data = e_data->src(); - int n_data_op_slot = e_data->src_output(); - int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, - n_data->num_outputs()); - - // If the source of meta edge is a constant node (producing dummy Mkl metadata - // tensor), then we will need to fix. - if (IsConstant(e_metadata->src())) { - Node* e_metadata_dst = e_metadata->dst(); - int e_metadata_in_slot = e_metadata->dst_input(); - CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, - e_metadata_dst, e_metadata_in_slot)); - - (*g)->RemoveEdge(e_metadata); - return true; - } - - return false; -} - -bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr* g, - Node* n) { - bool result = false; - - // If graph node is not Mkl node, then return. - DataType T = DT_INVALID; - if (!GetNodeAttr(n->def(), "T", &T).ok() || - !mkl_op_registry::IsMklOp(n->type_string(), T)) { - return result; - } - - // If it is Mkl node, then check if the input edges to this node that carry - // Mkl metadata are linked up correctly with the source node. - - // For Mkl nodes, we generate twice the number of input tensors (n for Mkl - // data tensors + n for Mkl metadata tensors). We need to check for correct - // connection of n metadata tensors only. 
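The fixup pass removed here relies on a simple indexing convention: an MKL node carries n data inputs plus n metadata inputs, and GetTensorMetaDataIndex maps a data slot to its metadata slot. The exact convention is not visible in this excerpt, so the small Python sketch below shows the two obvious layouts (contiguous and interleaved) purely as an illustration; which one the helper actually uses is not established here.

    def meta_index_contiguous(data_idx, num_inputs):
        # Layout [d0, ..., d(n-1), m0, ..., m(n-1)]:
        # metadata for the i-th data tensor lives at slot n + i.
        return num_inputs // 2 + data_idx

    def meta_index_interleaved(data_idx, num_inputs):
        # Layout [d0, m0, d1, m1, ...]:
        # metadata for the i-th data tensor lives at slot 2*i + 1.
        del num_inputs  # not needed for this layout
        return 2 * data_idx + 1

    # With 4 total inputs (2 data + 2 metadata tensors):
    assert [meta_index_contiguous(i, 4) for i in range(2)] == [2, 3]
    assert [meta_index_interleaved(i, 4) for i in range(2)] == [1, 3]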
- int num_data_inputs = n->num_inputs() / 2; - for (int idx = 0; idx < num_data_inputs; idx++) { - // Get the edge connecting input slot with index (idx). - const Edge* e = nullptr; - TF_CHECK_OK(n->input_edge(idx, &e)); - - // If e is control edge, then skip. - if (e->IsControlEdge()) { - continue; - } - - // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl - // node, then we don't need to do anything. - Node* e_src = e->src(); - if (GetNodeAttr(e_src->def(), "T", &T).ok() && - mkl_op_registry::IsMklOp(e_src->type_string(), T)) { - // Source node for edge 'e' is Mkl node. - // Destination node and destination input slot of e is node 'n' and 'idx' - // resp. - CHECK_EQ(e->dst(), n); - CHECK_EQ(e->dst_input(), idx); - - // Let's get edge that carries Mkl metadata corresponding to Mkl data edge - // 'e'. For that, let's first get the input slot of 'n' where the meta - // edge will feed the value. - int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(), - n->num_inputs()); - const Edge* e_meta = nullptr; - TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta)); - - // Let's check if we need to fix this meta edge. - if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) { - result = true; - } - } - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// // Run function for the pass /////////////////////////////////////////////////////////////////////////////// @@ -4422,25 +4307,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr* g) { DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g); - order.clear(); - GetReversePostOrder(**g, &order); // This will give us topological sort. - for (Node* n : order) { - // If node is not an op or it cannot run on CPU device, then skip. 
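The removed loop above walks nodes in the order produced by GetReversePostOrder, which the comment notes is a topological order (for acyclic graphs). For the intuition, here is a small self-contained Python sketch of that classic relationship: reversing a DFS post-order of a DAG yields a topological sort. It illustrates the general algorithm only, not TensorFlow's GetReversePostOrder.

    def reverse_post_order(graph, roots):
        # graph: dict mapping node -> list of successor nodes (a DAG).
        # Returns nodes in reverse DFS post-order, i.e. a topological order.
        visited, order = set(), []

        def dfs(node):
            visited.add(node)
            for succ in graph.get(node, []):
                if succ not in visited:
                    dfs(succ)
            order.append(node)  # post-order: emitted after all successors

        for root in roots:
            if root not in visited:
                dfs(root)
        return list(reversed(order))

    # A -> B -> D and A -> C -> D: every edge goes left-to-right in the result.
    g = {'A': ['B', 'C'], 'B': ['D'], 'C': ['D'], 'D': []}
    topo = reverse_post_order(g, ['A'])
    assert topo.index('A') < topo.index('B') < topo.index('D')
    assert topo.index('A') < topo.index('C') < topo.index('D')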
- if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { - continue; - } - if (FixMklMetaDataEdges(g, n)) { - string node_name = n->name(); - string op_name = n->type_string(); - - VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " - << node_name << " with op " << op_name; - result = true; - } - } - DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", - &**g); - return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 7645b4a7f0..029cdcf94a 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,37 +3518,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } -///////////////////////////////////////////////////////////////////// -// Post-rewrite fixup pass test - -TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { - InitGraph( - "node { name: 'A' op: 'Input'}" - "node { name: 'B' op: 'Input'}" - "node { name: 'M' op: '_MklInput'}" - "node { name: 'N' op: '_MklInput'}" - "node { name: 'C' op: '_MklConv2D'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'data_format' value { s: 'NCHW' } }" - " attr { key: 'use_cudnn_on_gpu' value { b: false } }" - " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" - " attr { key: 'padding' value { s: 'SAME' } }" - " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" - " input: ['A', 'B', 'M', 'N']}" - "node { name: 'D' op: 'Const' " - " attr { key: 'dtype' value { type: DT_UINT8 } }" - " attr { key: 'value' value { " - " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " - " int_val: 0 } } } }" - "node { name: 'E' op: '_MklAdd'" - " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'A', 'D', 'D']}"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" - "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" - "D->E:3;M->C:2;N->C:3"); -} - ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 0c02876ac5..6749a7c571 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,6 +610,7 @@ class SymbolicShapeRefiner { } }; + // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. 
ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 8ca726df0b..1b18087cdf 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,7 +679,6 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -781,6 +780,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 03e36a7b9c..4dde7ed1b4 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" -#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -201,7 +200,8 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); + VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() + << std::endl; AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index a7757d1361..66c4aff3e3 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,7 +73,6 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: - case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -130,7 +129,6 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); - ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 49b90e855b..14d889e8e3 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,41 +33,52 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); - OP_REQUIRES(ctx, (in0.shape() == in1.shape() || - TensorShapeUtils::IsScalar(in1.shape())) && - (in0.shape() == in2.shape() || - TensorShapeUtils::IsScalar(in2.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); - - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); - auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { + OP_REQUIRES(ctx, + (in0.shape() == in2.shape() && + TensorShapeUtils::IsScalar(in1.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. 
", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 17a85d9773..9a3b2303a3 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,7 +57,6 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); -TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index 5cd8e04927..e6fefe643b 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,7 +37,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 4563fc6353..39b6924d74 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,7 +31,6 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 4e53291b7f..7e5a9e1ec5 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,8 +228,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); -TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -241,8 +239,6 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) -TF_CALL_int32(REGISTER_GATHER_ND_GPU); -TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index da8d2e9e3c..b03efc684f 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,8 +119,6 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_int32(DEFINE_GPU_SPECS); -TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 094504d6b9..ef332ebee3 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,7 +153,6 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) -TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 31d1b949ef..5eeb23d810 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,7 +14,6 @@ limitations under the License. #include #include -#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -591,8 +590,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector mkl_input_shapes(N); - GetMklShapeList(context, "values", &mkl_input_shapes); + std::vector input_shapes(N); + GetMklShapeList(context, "values", &input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -611,14 +610,19 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() - ? mkl_input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = input_shapes[0].IsMklTensor() + ? input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : mkl_input_shapes) { + for (auto& s : input_shapes) { + if (s == expected_shape) { + ++i; + continue; + } + TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -661,14 +665,21 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - CallEigenVersion(context, input_tensors, mkl_input_shapes); + TensorShapeList tf_input_shapes; + i = 0; + for (auto& s : input_shapes) { + TensorShape s_shape = + s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); + tf_input_shapes.push_back(s_shape); + ++i; + } + CallEigenVersion(context, input_tensors, tf_input_shapes); return; } memory::dims dst_dims; - if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -678,61 +689,26 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - - bool isMklReorderNeeded = false; - memory::format mkl_common_format = memory::format::any; - if (are_all_mkl_inputs) { - mkl_common_format = - FindMklCommonFormat(mkl_input_shapes, concat_dim, - &isMklReorderNeeded, &dst_concat_dim_size); - - if (!isMklReorderNeeded) { - // All MKL tensors have a same format. Reorder is not needed. - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - auto src_md = mkl_input_shapes[k].GetMklLayout(); - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); - } - } else { - // MKL tensors have different formats. - // Reorder them to most common format. - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - auto src_dims = TFShapeToMklDnnDims( - mkl_input_shapes[k].GetTfShape()); - auto src_md = mkl_input_shapes[k].GetMklLayout(); - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - - if (src_md.data.format != mkl_common_format) - src_md = memory::desc(src_dims, MklDnnType(), - mkl_common_format); - - srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); - } - } - } else { // All TF inputs - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); - dst_concat_dim_size += src_dims[concat_dim]; - - // It does not matter what data format to be used (NHWC versus NCHW). - // We just need to ensure that output uses same data format as inputs. - auto src_md = - memory::desc(src_dims, MklDnnType(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); - } + for (int k = 0; k < N; k++) { + bool is_mkl_tensor = input_shapes[k].IsMklTensor(); + memory::dims src_dims; + + // Same comment as dst_dims for src_dims. + src_dims = (is_mkl_tensor) + ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) + : TFShapeToMklDnnDims(input_tensors[k].shape()); + + dst_concat_dim_size += src_dims[concat_dim]; + auto src_md = + is_mkl_tensor ? input_shapes[k].GetMklLayout() : + // It does not matter what data format we use here + // (NHWC or NCHW). We just need to ensure that output + // of Concat uses same data format as input. + memory::desc(src_dims, MklDnnType(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); } dst_dims[concat_dim] = dst_concat_dim_size; @@ -742,33 +718,25 @@ class MklConcatOp : public OpKernel { if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). 
- auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // Set the output format same as the most common format of inputs - // to avoid layout conversions. + // We will set the output in the same format as input to avoid layout + // conversions. + // Currently we are setting dst format same as input format. + // See if we can make this choice in a better way. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType(), mkl_common_format); + dst_dims_in_nchw, MklDnnType(), + (memory::format)input_shapes[0].GetMklLayout().data.format); } else { - // All inputs are TF tensors. - // Set the output format same as input format (nchw). + // Again, format does not matter here. We just need to make it same as + // input format. dst_md = memory::desc(dst_dims, MklDnnType(), memory::format::nchw); } std::vector inputs; - std::vector net; - if (isMklReorderNeeded) { - for (int k = 0; k < input_tensors.size(); k++) { - if (input_tensors[k].NumElements() > 0) { - srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); - } - } - } - for (int k = 0; k < input_tensors.size(); k++) { - if (input_tensors[k].NumElements() > 0) { - inputs.push_back(srcs[k].GetOpMem()); - } - } + for (int k = 0; k < input_tensors.size(); k++) + inputs.push_back(srcs[k].GetOpMem()); // If all inputs are in MKL format, then meaning of concat_dim needs to // change. Value of concat_dim is tied to input Tensorflow data format @@ -777,8 +745,7 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
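The remapping described in the comment above is just a permutation of dimension indices between the NHWC and NCHW layouts. A tiny Python illustration follows, assuming a 4-D tensor; the dictionary is a generic layout mapping used for illustration, not TensorFlow's TfDimIdx helper.

    # Position of each NHWC dimension inside an NCHW-ordered tensor.
    NHWC_TO_NCHW = {0: 0,  # N stays first
                    1: 2,  # H
                    2: 3,  # W
                    3: 1}  # C: dimension 3 in NHWC is dimension 1 in NCHW

    def concat_dim_nhwc_to_nchw(concat_dim):
        return NHWC_TO_NCHW[concat_dim]

    # Concatenating over channels (dim 3 in NHWC) means dim 1 once data is NCHW.
    assert concat_dim_nhwc_to_nchw(3) == 1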
- if (are_all_mkl_inputs) - concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -791,7 +758,7 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - mkl_input_shapes[0].GetTfDataFormat()); + input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -806,6 +773,7 @@ class MklConcatOp : public OpKernel { dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); + std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -819,27 +787,15 @@ class MklConcatOp : public OpKernel { } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const MklDnnShapeList& mkl_input_shapes) { - CHECK_EQ(values.size(), mkl_input_shapes.size()); + const TensorShapeList& input_shapes) { + CHECK_EQ(values.size(), input_shapes.size()); std::vector converted_values; - TensorShapeList tf_input_shapes; - for (int i = 0; i < mkl_input_shapes.size(); i++) { - if (mkl_input_shapes[i].IsMklTensor()) { - // do conversion from MKL to TF - Tensor tmp_tensor = - ConvertMklToTF(context, values[i], mkl_input_shapes[i]); - converted_values.push_back(tmp_tensor); - tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); - } else { - // no conversion since it is TF tensor already - converted_values.push_back(values[i]); - tf_input_shapes.push_back(values[i].shape()); - } - } + for (int i = 0; i < input_shapes.size(); i++) + converted_values.push_back(values[i]); // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); + eigen_concat_op_.Compute(context, converted_values, input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -856,55 +812,6 @@ class MklConcatOp : public OpKernel { output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); } - - // This method finds the most commom format accross all MKL inputs - // Inputs: - // 1. input_shapes: shapes of input (MKL) tensors. - // 2. concat_dim: concat dimension. - // Outputs: - // 1. is_reorder_needed is set to true if inputs have difference formats - // It is set to false otherwise. - // 2. concat_dim_size is the size of concat_dim. - // Return: - // return the common MKL format. - memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, - int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { - *is_reorder_needed = false; - *concat_dim_size = 0; - std::unordered_map occurrence_map; - if (input_shapes.size() == 0) - return memory::format::any; - - // Compute ocurrences of each format of all inputs. - for (int k=0; k ( - input_shapes[k].GetMklLayout().data.format); - occurrence_map[fmt] += 1; - } - - if (occurrence_map.size() == 1) { - // this means that all inputs have a same format - // return it with is_reorder_needed set false. - return static_cast( - input_shapes[0].GetMklLayout().data.format); - } - - // Input tensors have different formats. Thus, reorder is needed. - // We pick up the most common format to minimize the total - // number of input reorder. 
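The FindMklCommonFormat helper removed below is essentially a majority vote: count how often each memory format occurs among the inputs, and if more than one format is present, pick the most frequent one and flag that a reorder is needed. A compact Python sketch of that selection logic follows; it is an analogy using plain strings for formats, not the MKL-DNN API.

    from collections import Counter

    def find_common_format(formats):
        # formats: per-input memory formats, e.g. ['nchw', 'nChw16c', 'nchw'].
        # Returns (common_format, reorder_needed).
        if not formats:
            return None, False
        counts = Counter(formats)
        if len(counts) == 1:
            return formats[0], False          # all inputs already agree
        common, _ = counts.most_common(1)[0]  # majority vote minimizes reorders
        return common, True

    assert find_common_format(['nchw', 'nchw', 'nChw16c']) == ('nchw', True)
    assert find_common_format(['nChw16c', 'nChw16c']) == ('nChw16c', False)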
- memory::format commonest_format = memory::format::any; - int max_occurrence = 0; - *is_reorder_needed = true; - for (auto item : occurrence_map) { - if (item.second > max_occurrence) { - commonest_format = static_cast(item.first); - max_occurrence = item.second; - } - } - return commonest_format; - } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index f857be6c32..c1da0ded1d 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,7 +18,6 @@ limitations under the License. // bias. #ifdef INTEL_MKL -#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -265,5 +264,4 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ -#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index c0dfed7d7d..279167aba2 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,15 +199,13 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - if (input_tensor.NumElements() != 0) { - memory::desc input_md = + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); - } + dnn_data_input->SetUsrMem(input_md, &input_tensor); this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index e1fc2ea128..43c5b29509 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,7 +292,6 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); -TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -307,8 +306,6 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); -TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); -TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -579,7 +576,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd. 
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index 08b657f4c3..a3c21edc15 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,7 +170,6 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index 634f9ba887..bb0129fa6f 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,13 +216,8 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); - - // The elements of the third parameter to ExecOp must be multiples of - // Allocator::kAllocatorAlignment in size. If they are not, the backing - // tensor allocated by PrepOp will have too many elements and reshaping - // will fail. - ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); + MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); + ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index d65692a552..7796bf3587 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,14 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ - -// This file requires the following include because it uses CudaAtomicMax: -// #include "tensorflow/core/util/cuda_kernel_helper.h" - -// Unfortunately we can't add the #include, since it breaks compilation for -// non-GPU targets. This only breaks in clang, because it's more strict for -// template code and CudaAtomicMax is used in template context. - // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -138,4 +130,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 866c5dcd52..a1f9667b78 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is an overview of the SparseMatMul code. Note that we assume that the +// Here is a an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 26ab72f12e..4c2b312c34 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -44,63 +43,6 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } -std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { - // This SplitV2 method matches the behavior of python's str.split: - // If sep is given, consecutive delimiters are not grouped together - // and are deemed to delimit empty strings (for example, '1,,2'.split(',') - // returns ['1', '', '2']). The sep argument may consist of multiple - // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). - // Splitting an empty string with a specified separator returns ['']. - // - // If sep is not specified or is None, a different splitting algorithm is - // applied: runs of consecutive whitespace are regarded as a single - // separator, and the result will contain no empty strings at the start or - // end if the string has leading or trailing whitespace. Consequently, - // splitting an empty string or a string consisting of just whitespace - // with a None separator returns []. - - std::vector result; - - StringPiece text(str); - if (maxsplit == 0) { - result.emplace_back(std::string(text)); - return result; - } - - if (sep.empty()) { - StringPiece token; - // Remove leading whitespaces. - str_util::RemoveLeadingWhitespace(&text); - int split = 0; - while (str_util::ConsumeNonWhitespace(&text, &token)) { - result.emplace_back(std::string(token)); - str_util::RemoveLeadingWhitespace(&text); - ++split; - if (maxsplit > 0 && split == maxsplit) { - result.emplace_back(std::string(text)); - return result; - } - } - return result; - } - auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); - int split = 0; - while (p != text.end()) { - StringPiece token = text.substr(0, p - text.begin()); - result.emplace_back(std::string(token)); - text.remove_prefix(token.size()); - text.remove_prefix(sep.size()); - ++split; - if (maxsplit > 0 && split == maxsplit) { - result.emplace_back(std::string(text)); - return result; - } - p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); - } - result.emplace_back(std::string(text)); - return result; -} - } // namespace class StringSplitOp : public OpKernel { @@ -180,78 +122,6 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; -class StringSplitV2Op : public OpKernel { - public: - explicit StringSplitV2Op(OpKernelConstruction* context) - : OpKernel(context), maxsplit_(-1) { - OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* input_tensor; - OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), - errors::InvalidArgument("input must be a vector, got shape: ", - input_tensor->shape().DebugString())); - - const auto input_vec = input_tensor->vec(); - const int64 batch_size = input_vec.dimension(0); - - const Tensor* sep_tensor; - OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), - errors::InvalidArgument("sep must be a scalar, got shape: ", - sep_tensor->shape().DebugString())); - const auto sep_vec = sep_tensor->flat(); - StringPiece sep(sep_vec(0)); - std::vector tokens; - // Guess that we'll be unpacking a 
handful of tokens per example. - static constexpr int kReserveSize = 4; - tokens.reserve(batch_size * kReserveSize); - - int64 output_size = 0; - int64 max_num_entries = 0; - std::vector num_indices(batch_size); - for (int64 i = 0; i < batch_size; ++i) { - std::vector parts = SplitV2(input_vec(i), sep, maxsplit_); - int64 n_entries = parts.size(); - num_indices[i] = n_entries; - output_size += n_entries; - max_num_entries = std::max(max_num_entries, n_entries); - tokens.insert(tokens.end(), parts.begin(), parts.end()); - } - - Tensor* sp_indices_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), - &sp_indices_t)); - Tensor* sp_tokens_t; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); - Tensor* sp_shape_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); - - auto sp_indices = sp_indices_t->matrix(); - auto sp_tokens = sp_tokens_t->vec(); - auto sp_shape = sp_shape_t->vec(); - sp_shape(0) = batch_size; - sp_shape(1) = max_num_entries; - size_t c = 0; - for (size_t i = 0; i < batch_size; ++i) { - for (size_t j = 0; j < num_indices[i]; ++j) { - sp_indices(c, 0) = i; - sp_indices(c, 1) = j; - sp_tokens(c) = tokens[c]; - ++c; - } - } - } - - private: - int maxsplit_; -}; - REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); -REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), - StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e589c8d1c..6e4d100b04 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,15 +145,12 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes, must be a matrix. + // Validate true_classes. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); - // Validate sampled_candidates, must be a vector. - ShapeHandle sampled_candidates; - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 9dca5f53ce..15e0ca8af9 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,17 +218,7 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn([](shape_inference::InferenceContext* c) { - // Use index from the end to retrieve the Input shapes, - // so that to avoid guessing the length of "other_arguments". - // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars. 
- shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); - - return shape_inference::ScalarShape(c); - }); + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -241,17 +231,7 @@ REGISTER_OP("MapAndBatchDatasetV2") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn([](shape_inference::InferenceContext* c) { - // Use index from the end to retrieve the Input shapes, - // so that to avoid guessing the length of "other_arguments". - // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. - shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); - - return shape_inference::ScalarShape(c); - }); + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 87f4991134..d949e70c66 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,9 +454,7 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - // The rank of the input image (rank = 4) has already been restricted - // above, and the output is of the same shape as the input. - return shape_inference::UnchangedShape(c); + return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index b3487122e2..1740fa152c 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: numbertype") + .Attr("T: realnumbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 41efa49ce3..fc60e807b9 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,7 +1453,6 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 4423062362..1d5c743a56 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") + .Attr("T: {int32, int64, complex64, float, double, bool, int8}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,24 +134,6 
@@ REGISTER_OP("StringSplit") return Status::OK(); }); -REGISTER_OP("StringSplitV2") - .Input("input: string") - .Input("sep: string") - .Output("indices: int64") - .Output("values: string") - .Output("shape: int64") - .Attr("maxsplit: int = -1") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); - - c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); - c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); - c->set_output(2, c->Vector(2)); - return Status::OK(); - }); - REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index e9da3d8e32..99de364042 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,28 +344,5 @@ int CPUModelNum() { #endif } -int CPUIDNumSMT() { -#ifdef PLATFORM_IS_X86 - // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration - // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) - // Section: Detecting Hardware Multi-threads Support and Topology - // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures - // Other cases not supported - uint32 eax, ebx, ecx, edx; - // Check if system supports Leaf 11 - GETCPUID(eax, ebx, ecx, edx, 0, 0); - if (eax >= 11) { - // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0 - // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, - // ECX=0):ECX[15:8] is 1 - GETCPUID(eax, ebx, ecx, edx, 11, 0); - if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { - return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width - } - } -#endif // PLATFORM_IS_X86 - return 0; -} - } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index 175c9ae8b1..b5be7e8b54 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,10 +35,6 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); -// Returns an estimate of the number of hyperthreads per physical core -// on the CPU -int NumHyperthreadsPerCore(); - // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -111,9 +107,6 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); -// Returns num of hyperthreads per physical core -int CPUIDNumSMT(); - } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index a319ccbdbe..ae81f9b5b3 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,8 +71,6 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], - # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 - # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. 
cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index ff4b4436bb..72c12318ca 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,17 +115,18 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home != nullptr) { - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (status_.ok()) { - return; - } + if (hdfs_home == nullptr) { + status_ = errors::FailedPrecondition( + "Environment variable HADOOP_HDFS_HOME not set"); + return; + } + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (!status_.ok()) { + // try load libhdfs.so using dynamic loader's search path in case + // libhdfs.so is installed in non-standard location + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } - - // Try to load the library dynamically in case it has been installed - // to a in non-standard location. - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 708f32ba80..8e316472fe 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,11 +74,6 @@ int NumSchedulableCPUs() { return kDefaultCores; } -int NumHyperthreadsPerCore() { - static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); - return (ht_per_core > 0) ? ht_per_core : 1; -} - void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index cb1fd09dbb..522a9d84fd 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 9 +#define TF_MINOR_VERSION 8 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "-rc0" +#define TF_VERSION_SUFFIX "" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 90b6533690..dffc965b14 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,7 +42,6 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" -#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -713,48 +712,15 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else -using mkldnn::stream; -template class MklDnnData; - template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - try { - if (!mkl_shape.IsMklTensor()) - return mkl_tensor; // return input since it is already TF tensor - - TensorShape output_shape = mkl_shape.GetTfShape();; - - // Allocate output tensor. 
- context->allocate_temp(DataTypeToEnum::v(), - output_shape, &output_tensor); - - auto cpu_engine = engine(engine::cpu, 0); - MklDnnData input(&cpu_engine); - - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape.GetMklLayout(); - auto output_tf_md = mkl_shape.GetTfLayout(); - auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); - input.SetUsrMem(input_mkl_md, &mkl_tensor); - - // reorder - if (input.IsReorderNeeded(output_tf_pd)) { - std::vector net; - CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), - true); - stream(stream::kind::eager).submit(net).wait(); - } else { - // If not, just forward input tensor to output tensor. - CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); - } - } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - LOG(FATAL) << "Operation received an exception: " << error_msg; - } + TensorShape output_shape; + + TF_CHECK_OK( + Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); + return output_tensor; } #endif @@ -1877,7 +1843,7 @@ class FactoryKeyCreator { template void AddAsKey(const T data) { auto buffer = reinterpret_cast(&data); - Append(StringPiece(buffer, sizeof(T))); + Append(absl::string_view(buffer, sizeof(T))); } std::string GetKey() { @@ -1888,8 +1854,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(StringPiece s) { - key_.append(s.ToString()); + void Append(absl::string_view s) { + key_.append(string(s)); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index 0b07d413da..d92f5775fa 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,38 +1,17 @@ # User Groups -TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) +TensorFlow has communities around the world. 
## Asia -* [TensorFlow China community](https://www.tensorflowers.cn) -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) -* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) -* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) -* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) -* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) -* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) -* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) -* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) - -## America - -* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) - - -## Oceania -* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) - - -## Africa - -* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index bbb25e20c6..f08ac74425 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 232d2f1547..55579d52fb 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. -The easiest way to get started with TensorFlow is by using Eager Execution. +The easiest way to get started with TensorFlow is using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. 
See the diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 2901848745..1abd840ab3 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 55bc0f64e7..52a2a3f8a6 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 637231da12..1256fb99c4 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.9.0-rc0 + 1.8.0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.9.0-rc0 + 1.8.0 @@ -124,12 +124,12 @@ instead: org.tensorflow libtensorflow - 1.9.0-rc0 + 1.8.0 org.tensorflow libtensorflow_jni_gpu - 1.9.0-rc0 + 1.8.0 ``` @@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), which is the TensorFlow Java Archive (JAR). 2. 
Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip). 3. Extract this .zip file. -__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. + ### Validate the installation @@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
+
javac -cp libtensorflow-1.8.0.jar HelloTF.java
### Running @@ -241,11 +241,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index c8d706cf3c..0ed8160027 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,7 +339,9 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)): +Prior to installing TensorFlow with GPU support, ensure that your system meets all +[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container +with NVidia GPU support, enter a command of the following format:
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
@@ -436,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
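The wheel installed above can be given a quick smoke test of the kind the validation section below describes. A minimal sketch, assuming a TensorFlow 1.x graph-mode install in the active environment; the constant string is only illustrative:

```python
# Smoke test for a freshly installed TensorFlow 1.x wheel.
import tensorflow as tf

print('TensorFlow version:', tf.__version__)

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:  # TF 1.x graph-mode session
    print(sess.run(hello))  # expected: b'Hello, TensorFlow!'
```

If the import fails or the session cannot be created, the common installation problems sections referenced later in these guides are the place to look.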
## Validate your installation @@ -515,7 +517,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -682,14 +684,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -701,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -720,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
 
@@ -739,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 9d01271c5a..29a867a9e3 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-a
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index dc6c1e36fc..5ba522b436 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.9.0rc0 on Linux: +for TensorFlow 1.8.0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
 
## Validate your installation @@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: - * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux} - * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} - * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows} + * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux} + * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS} + * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows} Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we @@ -433,8 +433,6 @@ Stack Overflow and specify the `tensorflow` tag. **Linux** - - @@ -458,7 +456,6 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | 7 | 9
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
- @@ -475,8 +472,6 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.11.0 | N/A | N/A
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
- - diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index efef5dd0da..cf0db59021 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md @@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program: 1. Include the jcenter AAR which contains it, as in this - [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65) + [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65) 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/). diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 2b84dbb973..8b22c04d87 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md @@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this: - bazel build tensorflow/python/tools:freeze_graph - bazel-bin/tensorflow/python/tools/freeze_graph \ + bazel build tensorflow/tools:freeze_graph + bazel-bin/tensorflow/tools/freeze_graph \ --input_graph=/tmp/model/my_graph.pb \ --input_checkpoint=/tmp/model/model.ckpt-1000 \ --output_graph=/tmp/frozen_graph.pb \ diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index c97f74139c..2fea02d861 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md @@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.8.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.7.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
- +
Quantized | Float
0 | -10.0
128 | 10.0
255 | 30.0
128 | 10.0
Table 2: Example quantized value range diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index b13b47184d..c4aae1d9d6 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -21,17 +21,18 @@ Note: TensorFlow also includes a deprecated `Estimator` class at Estimators provide the following benefits: -* You can run Estimator-based models on a local host or on a +* You can run Estimators-based models on a local host or on a distributed multi-server environment without changing your model. - Furthermore, you can run Estimator-based models on CPUs, GPUs, + Furthermore, you can run Estimators-based models on CPUs, GPUs, or TPUs without recoding your model. * Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code. +* You can develop a state of the art model with high-level intuitive code, In short, it is generally much easier to create models with Estimators than with the low-level TensorFlow APIs. -* Estimators are themselves built on @{tf.layers}, which +* Estimators are themselves built on tf.layers, which simplifies customization. -* Estimators build the graph for you. +* Estimators build the graph for you. In other words, you don't have to + build the graph. * Estimators provide a safe distributed training loop that controls how and when to: * build the graph @@ -56,7 +57,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models -based on dense, feed-forward neural networks. +through dense, feed-forward neural networks. ### Structure of a pre-made Estimators program @@ -78,7 +79,7 @@ of the following four steps: an input function: def input_fn(dataset): - ... # manipulate dataset, extracting the feature dict and the label + ... # manipulate dataset, extracting feature names and the label return feature_dict, label (See @{$programmers_guide/datasets} for full details.) @@ -95,13 +96,13 @@ of the following four steps: population = tf.feature_column.numeric_column('population') crime_rate = tf.feature_column.numeric_column('crime_rate') median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn=lambda x: x - global_education_mean) + normalizer_fn='lambda x: x - global_education_mean') 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`: # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.LinearClassifier( + estimator = tf.estimator.Estimator.LinearClassifier( feature_columns=[population, crime_rate, median_education], ) diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 90f5c53a17..845194fe0e 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md @@ -528,10 +528,10 @@ suggested by the following snippet: categorical_column = ... # Create any categorical column # Represent the categorical column as an embedding column. -# This means creating an embedding vector lookup table with one element for each category. 
+# This means creating a one-hot vector with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=embedding_dimensions) + dimension=dimension_of_embedding_vector) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 86f5204ec3..03e60972aa 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,8 +21,7 @@ from __future__ import division from __future__ import print_function import os - -from six.moves.urllib.request import urlretrieve +import urllib import tensorflow as tf @@ -39,7 +38,9 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - urlretrieve(download_url, file_name) + raw = urllib.urlopen(download_url).read() + with open(file_name, 'w') as f: + f.write(raw) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index 9b171f66ec..debd95fc62 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,6 +376,9 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations + op_class.add_annotation( + Annotation::Create("Generated", "javax.annotation") + .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -412,12 +415,8 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense) - .EndLine() - .Write("// This class has been generated, DO NOT EDIT!") - .EndLine() - .EndLine() - .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); + writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, + &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 941ab2699c..181fd4c5e3 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,7 +96,6 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } - Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index bd97b181ff..b2e6c60021 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
- def grad_fn(*args, **kwds): + def grad_fn(*args): """Computes the gradient of the wrapped function.""" this_tape = tape.push_new_tape() try: - end_node = f(*args, **kwds) + end_node = f(*args) if end_node is None: raise ValueError("Cannot differentiate a function that returns None; " "did you forget to return a value from {}?".format( diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 20522098b0..9cd17e0407 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -978,10 +978,7 @@ py_test( size = "large", srcs = ["keras_test.py"], srcs_version = "PY2AND3", - tags = [ - "no_windows", - "notsan", - ], + tags = ["notsan"], deps = [ ":keras", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py index b18212cfcd..7cdf840c97 100644 --- a/tensorflow/python/estimator/exporter.py +++ b/tensorflow/python/estimator/exporter.py @@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result): return best_eval_result[default_key] > current_eval_result[default_key] -def _verify_compare_fn_args(compare_fn): +def _verify_compre_fn_args(compare_fn): """Verifies compare_fn arguments.""" args = set(util.fn_args(compare_fn)) if 'best_eval_result' not in args: @@ -265,7 +265,7 @@ class BestExporter(Exporter): self._compare_fn = compare_fn if self._compare_fn is None: raise ValueError('`compare_fn` must not be None.') - _verify_compare_fn_args(self._compare_fn) + _verify_compre_fn_args(self._compare_fn) self._saved_model_exporter = _SavedModelExporter( name, serving_input_receiver_fn, assets_extra, as_text) diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index a6cefdece2..035c7c148c 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -136,13 +136,11 @@ def numpy_input_fn(x, values in `x` have same shape). ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict. ValueError: if x or y is an empty dict. - TypeError: `x` is not a dict or array. - ValueError: if 'shuffle' is not provided or a bool. + TypeError: `x` is not a dict or array, or if `shuffle` is not bool. """ if not isinstance(shuffle, bool): - raise ValueError('shuffle must be provided and explicitly set as boolean ' - '(it is recommended to set it as True for training); ' - 'got {}'.format(shuffle)) + raise TypeError('shuffle must be explicitly set as boolean; ' + 'got {}'.format(shuffle)) def input_fn(): """Numpy input function.""" diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py index 81b201cc5c..92d057e25d 100644 --- a/tensorflow/python/estimator/inputs/numpy_io_test.py +++ b/tensorflow/python/estimator/inputs/numpy_io_test.py @@ -286,9 +286,8 @@ class NumpyIoTest(test.TestCase): x = np.arange(32, 36) y = np.arange(4) with self.test_session(): - with self.assertRaisesRegexp(ValueError, - 'shuffle must be provided and explicitly ' - 'set as boolean'): + with self.assertRaisesRegexp(TypeError, + 'shuffle must be explicitly set as boolean'): # Default shuffle is None. 
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 57f8e5fd6a..938e244fb3 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,16 +68,15 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - ValueError: if 'shuffle' is not provided or a bool. + TypeError: `shuffle` is not bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise ValueError('shuffle must be provided and explicitly set as boolean ' - '(it is recommended to set it as True for training); ' - 'got {}'.format(shuffle)) + raise TypeError('shuffle must be explicitly set as boolean; ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index dcecf6dd61..e5912a3b28 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,9 +70,8 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(ValueError, - 'shuffle must be provided and explicitly ' - 'set as boolean'): + with self.assertRaisesRegexp(TypeError, + 'shuffle must be explicitly set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 51a61adb21..8e2ec83020 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns) + 1, len(placeholders))) + len(dataframe.columns), len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index 2f439f765e..c80af08fba 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initialized(): +def _any_variable_initalized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initialized(): + if _any_variable_initalized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 5e094ae92b..6688a84130 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD -from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Read m - m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) - s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) + # Apply a mask + s_2 = keras.layers.Lambda(lambda k: + K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train.astype(np.str)} + 'input_m': input_m_train > 0} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test.astype(np.str)} + 'input_m': input_m_test > 0} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index af5d709f7e..2d6925d1a8 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testLoopWithVecAnd4D(self): @@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 2 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testBinaryOpSecondPort(self): diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 
f608dea430..e487f583be 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,8 +93,6 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". - References: - - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 9f91368e5b..70b6a8431a 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,6 +724,15 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) + if self.write_grads: + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -750,18 +759,6 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) - if self.write_grads: - for weight in layer.trainable_weights: - mapped_weight_name = weight.name.replace(':', '_') - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) - if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 5062a26580..b355f4a269 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,8 +653,6 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - # non_trainable_weights: moving_variance, moving_mean - model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 1c9135982e..a4cd017d60 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. 
self._layers = [] - # Used in symbolic mode only, only in conjunction with graph-networks + # Used in symbolic mode only, only in conjonction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 7e82db028b..6a94986b9c 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happened. + # The chunking of layer names array should have happend. self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happened. + # The chunking of layer names array should have happend. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index fce6cbdb7a..89c1f1a40f 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,7 +24,6 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context -from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -410,13 +409,11 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - constant_op.constant([[1.]], dtype=K.floatx()), - shape=[None, None], name=name + '_sample_weights')) + [[1.]], shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - constant_op.constant([1.], dtype=K.floatx()), - shape=[None], name=name + '_sample_weights')) + [1.], shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index e8838cd3bc..2ecbff3a1c 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they folow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost. 
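The slice_arrays docstring above notes that eager tensors cannot be fancy-indexed like numpy arrays, so batches are sliced via a concat/gather-style workaround. A minimal standalone sketch of that idea, using public tf.* names as an illustration rather than the library's own helper:

import tensorflow as tf

def slice_batch(arrays, indices):
  # Gather the requested rows from each tensor; eager tensors do not support
  # numpy-style fancy indexing, but tf.gather achieves the same selection.
  idx = tf.constant(indices, dtype=tf.int32)
  return [tf.gather(a, idx) for a in arrays]

x = tf.reshape(tf.range(12, dtype=tf.float32), [6, 2])
batch = slice_batch([x], [0, 2, 4])  # rows 0, 2 and 4 of x
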
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index c519e194bd..a54d6da839 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_max=2, target_min=-2) + target_mean=0., target_std=None, target_max=2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) + scale = np.sqrt(3. / fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) + scale = np.sqrt(6. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) + scale = np.sqrt(6. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) + scale = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) + scale = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) + scale = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index f60064ed63..5061825d38 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,9 +19,7 @@ from __future__ import division from __future__ import print_function import copy -import sys import types as python_types -import warnings import numpy as np @@ -716,7 +714,6 @@ class Lambda(Layer): return self.mask def get_config(self): - module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -724,26 +721,21 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' - output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' - output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' - output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, - 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, - 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -753,16 +745,8 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() - module = config.pop('module', None) - if module in sys.modules: - globs.update(sys.modules[module].__dict__) - elif module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn('{} is not loaded, but a Lambda layer uses it. ' - 'It may cause errors.'.format(module) - , UserWarning) if custom_objects: - globs.update(custom_objects) + globs = dict(list(globs.items()) + list(custom_objects.items())) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -776,14 +760,6 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) - output_shape_module = config.pop('output_shape_module', None) - if output_shape_module in sys.modules: - globs.update(sys.modules[output_shape_module].__dict__) - elif output_shape_module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' - 'It may cause errors.'.format(output_shape_module) - , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index e6e45902a8..c616d8f24f 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,19 +144,5 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) -class TestModelBackend(test.TestCase): - - def test_model_backend_float64_use_cases(self): - # Test case for GitHub issue 19318 - floatx = keras.backend.floatx() - keras.backend.set_floatx('float64') - - x = keras.Input((5,)) - y = keras.layers.Dense(1)(x) - model = keras.models.Model(x, y) - model.compile('rmsprop', 'mse') - - keras.backend.set_floatx(floatx) - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 94ed8ebd31..9d54add264 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,16 +130,6 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) - def testHalfInt(self): - s = lambda strs: [x.decode("ascii") for x in strs] - - with self.test_session(): - input_ = array_ops.placeholder(dtypes.int16) - int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] - output = string_ops.as_string(input_) - result = output.eval(feed_dict={input_: int_inputs_}) - self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) - def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 16fdedac41..08b03f8518 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - tf_logging.info("betainc gradient err = %g " % err) + print("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - tf_logging.info("betainc gradient err = %g " % err) + print("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index fb52d10475..e08123b041 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,12 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np - from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -417,16 +414,6 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) - def testClipByValueEmptyTensor(self): - # Test case for GitHub issue 19337 - zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) - x = clip_ops.clip_by_value(zero, zero, zero) - y = clip_ops.clip_by_value(zero, 1.0, 1.0) - z = clip_ops.clip_by_value(zero, zero, 1.0) - w = clip_ops.clip_by_value(zero, 1.0, zero) - with self.test_session(use_gpu=True) as sess: - sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 80ba7dafc9..8699fd5b25 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - tf_logging.info("expected = ", e_value) - tf_logging.info("actual = ", c_value) + print("expected = ", e_value) + print("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - tf_logging.info("expected = ", value_2) - tf_logging.info("actual = ", value) + print("expected = ", value_2) + print("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - tf_logging.info("expected = ", value_2) - tf_logging.info("actual = ", value) + print("expected = ", value_2) + print("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
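The removed testClipByValueEmptyTensor above exercised clip_by_value on empty inputs (GitHub issue 19337). A hedged, eager-mode sketch of the same check, assuming current tf.clip_by_value behaviour:

import numpy as np
import tensorflow as tf

empty = tf.constant(np.zeros((7, 0)), dtype=tf.float32)
clipped = tf.clip_by_value(empty, 0.0, 1.0)
print(clipped.shape)  # (7, 0): clipping an empty tensor is effectively a no-op
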
err = np.fabs(jacob_t - reference_jacob_t).max() - tf_logging.info("conv_2d gradient error = ", err) + print("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - tf_logging.info("value = ", value) + print("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - tf_logging.info("value = ", value) + print("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 58e2a8ac2a..91ebe8de99 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,21 +197,7 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndicesCPU(self): - with self.test_session(use_gpu=False): - params = [0, 1, 2] - indices = [[[0], [7]]] # Make this one higher rank - gather_nd = array_ops.gather_nd(params, indices) - with self.assertRaisesOpError( - r"flat indices\[1, :\] = \[7\] does not index into param " - r"\(shape: \[3\]\)"): - gather_nd.eval() - - def _disabledTestBadIndicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndices(self): with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -221,21 +207,7 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlicesCPU(self): - with self.test_session(use_gpu=False): - params = [[0, 1, 2]] - indices = [[[0], [0], [1]]] # Make this one higher rank - gather_nd = array_ops.gather_nd(params, indices) - with self.assertRaisesOpError( - r"flat indices\[2, :\] = \[1\] does not index into param " - r"\(shape: \[1,3\]\)"): - gather_nd.eval() - - def _disabledTestBadIndicesWithSlicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndicesWithSlices(self): with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 033fa95935..a2fcd751df 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,8 +27,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.int64, dtypes.float32, - dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.float32, dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -123,9 +122,6 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) - if dtype.is_integer: - self.assertEqual(params_grad, None) - continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -181,19 +177,7 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndicesCPU(self): - with self.test_session(use_gpu=False): - params = [[0, 1, 2], [3, 4, 5]] - with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): - array_ops.gather(params, [[7]], axis=0).eval() - with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): - array_ops.gather(params, [[7]], axis=1).eval() - - def _disabledTestBadIndicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndices(self): with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index 795aa67248..a9b55854f1 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,33 +362,6 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) -class VarianceScalingInitializationTest(test.TestCase): - - def testNormalDistribution(self): - shape = [100, 100] - expect_mean = 0. - expect_var = 1. / shape[0] - init = init_ops.variance_scaling_initializer(distribution='normal') - - with self.test_session(use_gpu=True): - x = init(shape).eval() - - self.assertNear(np.mean(x), expect_mean, err=1e-2) - self.assertNear(np.var(x), expect_var, err=1e-2) - - def testUniformDistribution(self): - shape = [100, 100] - expect_mean = 0. - expect_var = 1. / shape[0] - init = init_ops.variance_scaling_initializer(distribution='uniform') - - with self.test_session(use_gpu=True): - x = init(shape).eval() - - self.assertNear(np.mean(x), expect_mean, err=1e-2) - self.assertNear(np.var(x), expect_var, err=1e-2) - - # TODO(vrv): move to sequence_ops_test? 
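The removed VarianceScalingInitializationTest above checked that samples have mean near 0 and variance near scale / fan_in. A rough eager-mode sketch of the same property, with tf.keras.initializers.VarianceScaling assumed as a stand-in for init_ops.variance_scaling_initializer:

import numpy as np
import tensorflow as tf

shape = (100, 100)  # fan_in = 100 for a 2-D shape
init = tf.keras.initializers.VarianceScaling(scale=1.0, distribution='uniform')
x = init(shape).numpy()
print(np.mean(x), np.var(x))  # mean close to 0, variance close to 1 / 100
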
class RangeTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index e95c729715..a0c372db7d 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -947,7 +947,7 @@ class PoolingTest(test.TestCase): output_sizes, x_init_value=x_init_value, delta=1e-2) - tf_logging.info("%s gradient error = " % func_name, err) + print("%s gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _ConstructAndTestSecondGradient(self, @@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase): input_sizes, x_init_value=x_init_value, delta=1e-2) - tf_logging.info("%s second-order gradient error = " % func_name, err) + print("%s second-order gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 253e43920b..677253946e 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import gc import re import numpy as np @@ -435,29 +434,13 @@ class PyFuncTest(test.TestCase): # ----- Tests shared by py_func and eager_py_func ----- def testCleanup(self): - # Delete everything created by previous tests to avoid side effects. - ops.reset_default_graph() - gc.collect() - initial_size = script_ops._py_funcs.size() - # Encapsulate the graph generation, so locals can be deleted. - def make_graphs(): - for _ in xrange(1000): - g = ops.Graph() - with g.as_default(): - c = constant_op.constant([1.], dtypes.float32) - _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) - # These ops have a reference to 'c' which has a reference to the graph. - # Checks if the functions are being deleted though the graph is referenced from them. - # (see #18292) - _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) - - # Call garbage collector to enforce deletion. 
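The removed testCleanup above relied on registered py_func callables being garbage-collected once the graph holding the only strong reference goes away. A toy, pure-Python sketch of that weak-reference registry pattern (illustrative names only, not the TensorFlow registry):

import gc
import weakref

class Registry(object):
  def __init__(self):
    # Only weak references are stored; the caller keeps the strong reference.
    self._funcs = weakref.WeakValueDictionary()
    self._next = 0

  def insert(self, func):
    token = 'pyfunc_%d' % self._next
    self._next += 1
    self._funcs[token] = func
    return token

  def size(self):
    return len(self._funcs)

reg = Registry()

def make_and_drop():
  def f(x):          # local function: its only strong reference is local
    return x + 1
  reg.insert(f)

make_and_drop()
gc.collect()
assert reg.size() == 0  # the entry disappeared together with its function
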
- make_graphs() - ops.reset_default_graph() - gc.collect() - self.assertEqual(initial_size, script_ops._py_funcs.size()) + for _ in xrange(1000): + g = ops.Graph() + with g.as_default(): + c = constant_op.constant([1.], dtypes.float32) + _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) + self.assertLess(script_ops._py_funcs.size(), 100) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index faa4b49a8d..79fe927b8a 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,9 +144,7 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.int32, - np.float32, np.float64, - np.complex64, np.complex128): + for vtype in (np.float32, np.float64, np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -223,7 +221,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.int32, np.float32, np.float64): + for vtype in (np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index 1a0fa744ae..c70a4ffce7 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,13 +159,7 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. 
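The scatter test hunk just below clips small update values away from zero so the division-based scatter variants stay finite. A standalone rendering of the integer-aware variant from the removed lines, for context (illustrative only):

import numpy as np

def clip_small_values(x):
  # Keep update values away from zero so scatter_div's reference computation
  # never divides by a near-zero value.
  threshold = 1e-4
  sign = np.sign(x)
  if isinstance(x, np.int32):
    threshold = 1                       # integers: push at least one unit away
    sign = np.random.choice([-1, 1])
  return threshold * sign if np.abs(x) < threshold else x

updates = np.vectorize(clip_small_values)(np.random.randn(8).astype(np.float32))
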
def clip_small_values(x): - threshold = 1e-4 - sign = np.sign(x) - - if isinstance(x, np.int32): - threshold = 1 - sign = np.random.choice([-1, 1]) - return threshold * sign if np.abs(x) < threshold else x + return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -187,11 +181,7 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - vtypes = [np.float32, np.float64] - if tf_scatter != state_ops.scatter_div: - vtypes.append(np.int32) - - for vtype in vtypes: + for vtype in (np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index a82855dfeb..794be096b7 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,9 +264,7 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0), - (np.ndarray.__mul__, None, - math_ops.unsorted_segment_prod, lambda t: 1)] + math_ops.unsorted_segment_sum, lambda t: 0)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index e20daccb28..a5bd1b6ee0 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,101 +146,5 @@ class StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) -class StringSplitV2OpTest(test.TestCase): - - def testSplitV2(self): - strings = ["pigs on the wing", "animals"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) - self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) - self.assertAllEqual(shape, [2, 4]) - - def testSplitV2MultiCharSeparator(self): - # Match Python behavior: - # >>> '1<>2<>3'.split('<>') - # ['1', '2', '3'] - # >>> "<><>4<>5<><>6<>".split("<>") - # ['', '', '4', '5', '', '6', ''] - strings = ["1<>2<>3", "<><>4<>5<><>6<>"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep="<>") - indices, values, shape = sess.run(tokens) - self.assertAllEqual( - indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) - self.assertAllEqual(values, [b"1", b"2", b"3", - b"", b"", b"4", b"5", b"", b"6", b""]) - self.assertAllEqual(shape, [2, 7]) - - def testSplitV2SimpleSeparator(self): - # Match Python behavior: - # >>> '1,2,3'.split(',') - # ['1', '2', '3'] - # >>> '1,2,,3,'.split(',') - # ['1', '2', '', '3', ''] - strings = ["1,2,3", "4,5,,6,"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep=',') - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) - self.assertAllEqual(values, [b"1", b"2", b"3", - b"4", b"5", b"", b"6", b""]) - self.assertAllEqual(shape, 
[2, 5]) - - def testSplitV2EmptySeparator(self): - # Match Python behavior: - # >>> '1 2 3'.split() - # ['1', '2', '3'] - #>>> ' 1 2 3 '.split() - #['1', '2', '3'] - strings = ["1 2 3", " 4 5 6 "] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2]]) - self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) - self.assertAllEqual(shape, [2, 3]) - - def testSplitV2SimpleSeparatorMaxSplit(self): - # Match Python behavior: - # >>> '1,2,3'.split(',', maxsplit=1) - # ['1', '2,3'] - # >>> '4,5,,6,'.split(',', maxsplit=1) - # ['4', '5,,6,'] - strings = ["1,2,3", "4,5,,6,"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], - [1, 0], [1, 1]]) - self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) - self.assertAllEqual(shape, [2, 2]) - - def testSplitV2EmptySeparatorMaxSplit(self): - # Match Python behavior: - # '1 2 3'.split(maxsplit=1) - # ['1', '2 3'] - # >>> " 4 5 6 ".split(maxsplit=1) - # ['4', '5 6 '] - strings = ["1 2 3", " 4 5 6 "] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, maxsplit=1) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], - [1, 0], [1, 1]]) - self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) - self.assertAllEqual(shape, [2, 2]) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fae63b1132..8129334703 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,10 +2619,6 @@ reverse.__doc__ = gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") -@deprecation.deprecated_args( - None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") -@deprecation.deprecated_args( - None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 94c8d79335..12afcd0b51 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. 
For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[::2, ::2] = d(Re y)/d(Re x) - J[::2, 1::2] = d(Im y)/d(Re x) - J[1::2, ::2] = d(Re y)/d(Im x) - J[1::2, 1::2] = d(Im y)/d(Im x) + J[:m, :n] = d(Re y)/d(Re x) + J[:m, n:] = d(Im y)/d(Re x) + J[m:, :n] = d(Re y)/d(Im x) + J[m:, n:] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f27d9224c1..bdcf420980 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -259,14 +258,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels].` seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. + Raises: ValueError: if the shape of `image` not supported. """ @@ -281,14 +280,13 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels].` seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. @@ -299,8 +297,7 @@ def random_flip_left_right(image, seed=None): def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels].` flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -309,37 +306,22 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. 
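The revised compute_gradient docstring above describes a block layout for complex Jacobians. A small numpy illustration of how the four real-valued blocks assemble into the `[m * 2, n * 2]` matrix (toy values, not the gradient checker itself):

import numpy as np

m, n = 2, 3
d_re_re = np.zeros((m, n))      # d(Re y)/d(Re x)
d_im_re = np.ones((m, n))       # d(Im y)/d(Re x)
d_re_im = np.full((m, n), 2.0)  # d(Re y)/d(Im x)
d_im_im = np.full((m, n), 3.0)  # d(Im y)/d(Im x)

J = np.block([[d_re_re, d_im_re],
              [d_re_im, d_im_im]])
assert J.shape == (2 * m, 2 * n)
assert np.all(J[:m, n:] == 1.0)  # the d(Im y)/d(Re x) block
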
""" with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _AssertAtLeast3DImage(image) - shape = image.get_shape() - if shape.ndims == 3 or shape.ndims is None: - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope - ) - return fix_image_flip_shape(image, result) - elif shape.ndims == 4: - uniform_random = random_ops.random_uniform( - [array_ops.shape(image)[0]], 0, 1.0, seed=seed - ) - mirror_cond = math_ops.less(uniform_random, .5) - return array_ops.where( - mirror_cond, - image, - functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) - ) - else: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + image = _Assert3DImage(image) + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope) + return fix_image_flip_shape(image, result) @tf_export('image.flip_left_right') @@ -1652,13 +1634,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): +def decode_image(contents, channels=None, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` - of type `dtype`. + appropriate operation to convert the input bytes `string` into a `Tensor` of + type `uint8`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1670,11 +1652,10 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. - dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `dtype` and shape `[height, width, num_channels]` for + `Tensor` with type `uint8` with shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
@@ -1698,7 +1679,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) + return gen_image_ops.decode_bmp(contents) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1711,7 +1692,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) + return gen_image_ops.decode_gif(contents) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1720,11 +1701,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): def _png(): """Decodes a PNG image.""" - return convert_image_dtype( - gen_image_ops.decode_png(contents, channels, - dtype=dtypes.uint8 - if dtype == dtypes.uint8 - else dtypes.uint16), dtype) + return gen_image_ops.decode_png(contents, channels) def check_png(): """Checks if an image is PNG.""" @@ -1740,8 +1717,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return convert_image_dtype( - gen_image_ops.decode_jpeg(contents, channels), dtype) + return gen_image_ops.decode_jpeg(contents, channels) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1902,7 +1878,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within this range. + supplied image within in this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints. 
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 2a6ab26e96..45499dcce0 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,37 +533,6 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) - def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): - image_shape = [16, 299, 299, 3] - warmup_rounds = 100 - benchmark_rounds = 1000 - config = config_pb2.ConfigProto() - if cpu_count is not None: - config.inter_op_parallelism_threads = 1 - config.intra_op_parallelism_threads = cpu_count - with session.Session("", graph=ops.Graph(), config=config) as sess: - with ops.device(device): - inputs = variables.Variable( - random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, - trainable=False, - dtype=dtypes.float32) - run_op = image_ops.random_flip_left_right(inputs) - sess.run(variables.global_variables_initializer()) - for i in xrange(warmup_rounds + benchmark_rounds): - if i == warmup_rounds: - start = time.time() - sess.run(run_op) - end = time.time() - step_time = (end - start) / benchmark_rounds - tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") - print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " - "%.2f us" % - (tag, step_time * 1e6)) - self.report_benchmark( - name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), - iters=benchmark_rounds, - wall_time=step_time) - def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -582,15 +551,6 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) - def benchmarkBatchedRandomFlipLeftRightCpu1(self): - self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) - - def benchmarkBatchedRandomFlipLeftRightCpuAll(self): - self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) - - def benchmarkBatchedRandomFlipLeftRightGpu(self): - self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) - class AdjustHueBenchmark(test.Benchmark): @@ -1027,7 +987,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf, seed=seed) + y = image_ops.random_flip_left_right(x_tf) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1048,50 +1008,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) - def testRandomFlipLeftRightWithBatch(self): - batch_size = 16 - seed = 42 - - # create single item of test data - x_np_raw = np.array( - [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - y_np_raw = np.array( - [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - - # create batched test data - x_np = np.vstack([x_np_raw for _ in range(batch_size)]) - y_np = np.vstack([y_np_raw for _ in range(batch_size)]) - - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf, seed=seed) - self.assertTrue(y.op.name.startswith("random_flip_left_right")) - - count_flipped = 0 - count_unflipped = 0 - for _ in range(100): - y_tf = y.eval() - - # check every element of the batch - for i in range(batch_size): - if y_tf[i][0][0] 
== 1: - self.assertAllEqual(y_tf[i], x_np[i]) - count_unflipped += 1 - else: - self.assertAllEqual(y_tf[i], y_np[i]) - count_flipped += 1 - - # 100 trials, each containing batch_size elements - # Mean: 50 * batch_size - # Std Dev: ~5 * sqrt(batch_size) - # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) - # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 - six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) - self.assertGreaterEqual(count_flipped, six_sigma) - self.assertGreaterEqual(count_unflipped, six_sigma) - def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1141,11 +1057,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) - seed = 42 - with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=seed) + y = image_ops.random_flip_up_down(x_tf, seed=42) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1165,50 +1079,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) - def testRandomFlipUpDownWithBatch(self): - batch_size = 16 - seed = 42 - - # create single item of test data - x_np_raw = np.array( - [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - y_np_raw = np.array( - [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - - # create batched test data - x_np = np.vstack([x_np_raw for _ in range(batch_size)]) - y_np = np.vstack([y_np_raw for _ in range(batch_size)]) - - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=seed) - self.assertTrue(y.op.name.startswith("random_flip_up_down")) - - count_flipped = 0 - count_unflipped = 0 - for _ in range(100): - y_tf = y.eval() - - # check every element of the batch - for i in range(batch_size): - if y_tf[i][0][0] == 1: - self.assertAllEqual(y_tf[i], x_np[i]) - count_unflipped += 1 - else: - self.assertAllEqual(y_tf[i], y_np[i]) - count_flipped += 1 - - # 100 trials, each containing batch_size elements - # Mean: 50 * batch_size - # Std Dev: ~5 * sqrt(batch_size) - # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) - # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 - six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) - self.assertGreaterEqual(count_flipped, six_sigma) - self.assertGreaterEqual(count_unflipped, six_sigma) - def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1286,7 +1156,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, - image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1297,6 +1166,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) + for op in [ + image_ops.random_flip_left_right, + image_ops.random_flip_up_down, + ]: + with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): + op(p_wrong_rank) + + def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1331,6 +1208,41 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) +class RandomFlipTest(test_util.TensorFlowTestCase): + + def testRandomLeftRight(self): + x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) + num_iterations = 500 + + hist = [0, 0] + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf) + for _ in xrange(num_iterations): + y_np = y.eval().flatten()[0] + hist[y_np] += 1 + + # Ensure that each entry is observed within 4 standard deviations. + four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) + self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) + + def testRandomUpDown(self): + x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) + num_iterations = 500 + + hist = [0, 0] + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf) + for _ in xrange(num_iterations): + y_np = y.eval().flatten()[0] + hist[y_np] += 1 + + # Ensure that each entry is observed within 4 standard deviations. + four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) + self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) + + class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3968,88 +3880,5 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) -class DecodeImageTest(test_util.TensorFlowTestCase): - - def testJpegUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/jpeg/testdata" - jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) - image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testPngUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/png/testdata" - png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) - image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype( - image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testGifUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/gif/testdata" - gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) - image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testBmpUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/bmp/testdata" - bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) - image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testJpegFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/jpeg/testdata" - jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) - image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testPngFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/png/testdata" - png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) - image0 = image_ops.decode_image(png0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype( - image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testGifFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/gif/testdata" - gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) - image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testBmpFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/bmp/testdata" - bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) - image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 724fcc39cd..2df230d470 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,8 +467,7 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) - stddev = math.sqrt(scale) / .87962566103423978 + stddev = math.sqrt(scale) return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 8276047cb6..222b8ebc9d 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,9 +35,8 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# have an upper-case version of them. For users with Python 3 or Python 2.7 -# with `from __future__ import print_function`, we also allow lowercase. -@tf_export("Print", "print") +# use an upper-case version of them. +@tf_export("Print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 466d0dadc8..e40481f3a7 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, - `int32`, `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, + `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
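The removed comment above attributes the 0.87962566103423978 divisor to scipy.stats.truncnorm. A quick check of that constant (requires scipy; shown only to document where the number comes from):

from scipy.stats import truncnorm

# Standard deviation of a standard normal truncated to [-2, 2]; dividing by it
# keeps truncated-normal samples at the requested stddev.
print(truncnorm.std(a=-2, b=2, loc=0., scale=1.))  # ~= 0.87962566103423978
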
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, - `complex64`, or `complex128`. - y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, - `complex64`, or `complex128`. + x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, + or `complex128`. + y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, + or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. + x: A `Tensor` of type `float32` or `float64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index f47f38e29e..783d485892 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: - counts: A `Tensor` containing the total count of the data (one value). + counts: A `Tensor` containing a the total count of the data (one value). 
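The reduce_* docstring edits above hinge on what happens when `axis` is left unset. A short eager-mode illustration mirroring the reduce_sum docstring example:

import tensorflow as tf

x = tf.constant([[1, 1, 1], [1, 1, 1]])
tf.reduce_sum(x)                         # 6: with axis unset, all dimensions are reduced
tf.reduce_sum(x, axis=0)                 # [2, 2, 2]
tf.reduce_sum(x, axis=1)                 # [3, 3]
tf.reduce_sum(x, axis=1, keepdims=True)  # [[3], [3]]
tf.reduce_sum(x, axis=[0, 1])            # 6
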
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,9 +689,6 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance - # Note: stop_gradient does not change the gradient that gets - # backpropagated to the mean from the variance calculation, - # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 0c2f5b06c4..a0b55eb077 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: + with ops.name_scope(name, "LeakyRelu", [features, alpha]): features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features, name=name) + return math_ops.maximum(alpha * features, features) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 035b4735af..46a5f4fae6 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,16 +962,6 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) - def testName(self): - np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) - outputs_with_name_set = nn_ops.leaky_relu( - constant_op.constant(np_values), - name='test_relu_op') - self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') - outputs_without_name_set = nn_ops.leaky_relu( - constant_op.constant(np_values)) - self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') - class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index 219562de5d..f8676ccb5f 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,7 +23,6 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import -import weakref import numpy as np import six @@ -130,14 +129,11 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - # Only store weakrefs to the funtions. The strong reference is stored in - # the graph. - self._funcs = weakref.WeakValueDictionary() + self._funcs = {} def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() - # Store a weakref to the function self._funcs[token] = func return token @@ -190,7 +186,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`. 
""" - func = self._funcs.get(token, None) + func = self._funcs[token] if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -232,6 +228,19 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) +class CleanupFunc(object): + """A helper class to remove a registered function from _py_funcs.""" + + def __init__(self, token): + self._token = token + + def __del__(self): + if _py_funcs is not None: + # If _py_funcs is None, the program is most likely in shutdown, and the + # _py_funcs object has been destroyed already. + _py_funcs.remove(self._token) + + def _internal_py_func(func, inp, Tout, @@ -261,15 +270,17 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph + cleanup = CleanupFunc(token) + # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_py_funcs_used_in_graph"): - graph._py_funcs_used_in_graph = [] + if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): + graph._cleanup_py_funcs_used_in_graph = [] - # Store a reference to the function in the graph to ensure it stays alive - # as long as the graph lives. When the graph is destroyed, the function - # is left to the garbage collector for destruction as well. - graph._py_funcs_used_in_graph.append(func) + # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph + # will be destroyed and their __del__ will remove the 'token' from + # the funcs registry. + graph._cleanup_py_funcs_used_in_graph.append(cleanup) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index c3b16a7bd5..0130233746 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,8 +84,6 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") -@deprecation.deprecated_args( - None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -599,8 +597,6 @@ class KeywordRequired(object): @tf_export("sparse_split") -@deprecation.deprecated_args( - None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 0280c89c10..ae79c01949 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,59 +91,6 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) -@tf_export("strings.split") -def string_split_v2(source, sep=None, maxsplit=-1): - """Split elements of `source` based on `sep` into a `SparseTensor`. - - Let N be the size of source (typically N will be the batch size). Split each - element of `source` based on `sep` and return a `SparseTensor` - containing the split tokens. Empty tokens are ignored. - - For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', - then the output will be - - st.indices = [0, 0; - 0, 1; - 1, 0; - 1, 1; - 1, 2] - st.shape = [2, 3] - st.values = ['hello', 'world', 'a', 'b', 'c'] - - If `sep` is given, consecutive delimiters are not grouped together and are - deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and - sep of `"<>"` returns `["1", "2", "", "3"]`. 
If `sep` is None or an empty - string, consecutive whitespace are regarded as a single separator, and the - result will contain no empty strings at the startor end if the string has - leading or trailing whitespace. - - Note that the above mentioned behavior matches python's str.split. - - Args: - source: `1-D` string `Tensor`, the strings to split. - sep: `0-D` string `Tensor`, the delimiter character. - maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result. - - Raises: - ValueError: If sep is not a string. - - Returns: - A `SparseTensor` of rank `2`, the strings split according to the delimiter. - The first column of the indices corresponds to the row in `source` and the - second column corresponds to the index of the split component in this row. - """ - if sep is None: - sep = '' - sep = ops.convert_to_tensor(sep, dtype=dtypes.string) - source = ops.convert_to_tensor(source, dtype=dtypes.string) - - indices, values, shape = gen_string_ops.string_split_v2( - source, sep=sep, maxsplit=maxsplit) - indices.set_shape([None, 2]) - values.set_shape([None]) - shape.set_shape([2]) - return sparse_tensor.SparseTensor(indices, values, shape) - def _reduce_join_reduction_dims(x, axis, reduction_indices): """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 47414c28af..f49e2d314d 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,23 +1786,6 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` - Simple example of how to reenter a premade variable scope safely: - - ```python - with tf.variable_scope("foo") as vs: - pass - - # Re-enter the variable scope. - with tf.variable_scope(vs, - auxiliary_name_scope=False) as vs1: - # Restore the original name_scope. - with tf.name_scope(vs1.original_name_scope): - v = tf.get_variable("v", [1]) - assert v.name == "foo/v:0" - c = tf.constant([1], name="c") - assert c.name == "foo/c:0" - ``` - Basic example of sharing a variable AUTO_REUSE: ```python @@ -1941,9 +1924,7 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't create it. Note that the argument is - not inherited, and it only takes effect for once when creating. You - should only use it for re-entering a premade variable scope. + the scope. If `False`, we don't touch name scope. Returns: A scope that can be captured and reused. 
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100644 new mode 100755 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index b59f8e1f98..522965990b 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index 671b7e387e..bca9fa49eb 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,11 +41,7 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" - -from __future__ import print_function - """ -_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -153,7 +149,6 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) -__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -338,8 +333,7 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + - text + _GENERATED_FILE_FOOTER) + get_module_docstring(module, package, api_name) + text) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 10171b3d60..5bb3b3c444 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index 3051c4437e..dc2bd40096 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,10 +1532,6 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "print" - argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " - } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index b641c39feb..a3fbe95bba 100644 
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,8 +4,4 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "split" - argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " - } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 883bb93647..5fa75e1d61 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,10 +322,6 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" - - # Force downgrade setuptools. - pip install --upgrade setuptools==39.1.0 - } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index b216e3549f..d4bf546d40 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 1f0fd0387a..072dd6ab99 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,12 +134,6 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi -# If caller wants the with_the_same_user script to allow bad usernames, -# pass the var to the docker environment -if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then - CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" -fi - # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -154,7 +148,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ - ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 148526492d..420d390d2b 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,8 +32,7 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" - "\.\d.dev[\d]{0,8})-(.+)\.whl") +TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 88f1d04193..60290df833 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,7 +115,3 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 - -# Install last working version of setuptools. -pip2 install --upgrade setuptools==39.1.0 -pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index acd69ef346..edb9d4b929 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,6 +39,7 @@ if [[ -z $pip35_version ]]; then fi set -e +pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -85,7 +86,4 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 -# Install last working version of setuptools. -pip3.5 install --upgrade setuptools==39.1.0 - # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 323b30f48e..5635977731 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,6 +49,7 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 +pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -100,8 +101,4 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 - -# Install last working version of setuptools. -pip3 install --upgrade setuptools==39.1.0 - # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh deleted file mode 100755 index 10a09a415a..0000000000 --- a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Usage: basic_mkl_test.sh - -# Helper function to traverse directories up until given file is found. -function upsearch () { - test / == "$PWD" && return || \ - test -e "$1" && echo "$PWD" && return || \ - cd .. && upsearch "$1" -} - -# Set up WORKSPACE. -WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" - -BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index b8bce57c87..1bd1852ffc 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,7 +79,6 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" - WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -87,7 +86,6 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' - WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -102,8 +100,6 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ - //tensorflow:libtensorflow.so \ - //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -116,12 +112,10 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" -cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" -cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index f8f63e276c..47539b2423 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,11 +31,7 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - - undname = find_msvc_tool(repository_ctx, vc_path, 
"undname.exe") - if undname == None: - auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) - undname_bin_path = undname.replace("\\", "\\\\") + undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index b0114721bd..06c2b997cb 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,6 +64,9 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" +# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below +DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" + # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -74,7 +77,8 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." + WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} + echo "use default whl file location" fi while true; do @@ -127,11 +131,7 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -# Download whl file into the build context directory. -if [[ -z "${WHL_FILE_LOCATION}" ]]; then - pip2 download --no-deps tf-nightly - cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl -elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index e188c88c8f..935535312d 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." + die "whl URL is not specified" fi # Create docker build context directory. @@ -121,13 +121,8 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -if [[ -z "${WHL_URL}" ]]; then - pip2 download --no-deps tf-nightly - cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl -else - wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" -fi +wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 57a491255e..406d134699 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index 6796ad70e5..a6cd44ced1 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.9 +ARG TF_BRANCH=r1.8 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 204b5b4dba..2fe47f3356 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.1.4.18-1+cuda9.0 \ - libcudnn7-dev=7.1.4.18-1+cuda9.0 \ + libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7-dev=7.0.5.15-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index 9197651ff4..bff4a20392 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7=7.0.5.15-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 620fef9363..5910f0625e 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,7 +61,6 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:core", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", - "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index f7e42ce536..0c4065bc77 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,15 +41,51 @@ function is_windows() { fi } -function prepare_src() { +function main() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - TMPDIR="$1" - mkdir -p "$TMPDIR" - echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" + DEST=$(real_path $1) + TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) + + PKG_NAME_FLAG="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + PROJECT_NAME="" + while true; do + if [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == 
"--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + echo $(date) : "=== Using tmpdir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -119,28 +155,17 @@ function prepare_src() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} > /dev/null + pushd ${RUNFILES%org_tensorflow} for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd > /dev/null + popd cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} -} - -function build_wheel() { - if [ $# -lt 2 ] ; then - echo "No src and dest dir provided" - exit 1 - fi - - TMPDIR="$1" - DEST="$2" - PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -148,110 +173,15 @@ function build_wheel() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} > /dev/null + pushd ${TMPDIR} rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd > /dev/null + popd + rm -rf ${TMPDIR} echo $(date) : "=== Output wheel file is in: ${DEST}" } -function usage() { - echo "Usage:" - echo "$0 [--src srcdir] [--dst dstdir] [options]" - echo "$0 dstdir [options]" - echo "" - echo " --src prepare sources in srcdir" - echo " will use temporary dir if not specified" - echo "" - echo " --dst build wheel in dstdir" - echo " if dstdir is not set do not build, only prepare sources" - echo "" - echo " Options:" - echo " --project_name set project name to name" - echo " --gpu build tensorflow_gpu" - echo " --gpudirect build tensorflow_gpudirect" - echo " --nightly_flag build tensorflow nightly" - echo "" - exit 1 -} - -function main() { - PKG_NAME_FLAG="" - PROJECT_NAME="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - SRCDIR="" - DSTDIR="" - CLEANSRC=1 - while true; do - if [[ "$1" == "--help" ]]; then - usage - exit 1 - elif [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - elif [[ "$1" == "--src" ]]; then - shift - SRCDIR="$(real_path $1)" - CLEANSRC=0 - elif [[ "$1" == "--dst" ]]; then - shift - DSTDIR="$(real_path $1)" - else - DSTDIR="$(real_path $1)" - fi - shift - - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then - echo "No destination dir provided" - usage - exit 1 - fi - - if [[ -z "$SRCDIR" ]]; then - # make temp srcdir if none set - SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" - fi - - prepare_src "$SRCDIR" - - if [[ -z 
"$DSTDIR" ]]; then - # only want to prepare sources - exit - fi - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" - - if [[ $CLEANSRC -ne 0 ]]; then - rm -rf "${TMPDIR}" - fi -} - main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 97f625e7e9..d25a9e77b1 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.9.0-rc0' +_VERSION = '1.8.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,7 +54,6 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', - 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 15d7c70281..29add6d5ea 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,9 +814,6 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); - Print(); - Print("#include "); // for `std::stable_sort()` - Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index 92bb5127da..df71840b64 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " - + str(len(flat_b))) + print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( + len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - tf_logging.info("Tensors have {0} different values ({1}%), with mean" - " difference {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, - mean_difference, mean_abs_difference)) + print("Tensors have {0} different values ({1}%), with mean difference" + " {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, mean_difference, + mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index c030575109..9c45359ee1 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,6 +89,7 @@ import shutil from six import text_type from google.cloud import datastore +from six import text_type def is_real_file(dirpath, fname): diff --git 
a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 4f3df570a5..dbec66216a 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", ], - sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", - strip_prefix = "mklml_lnx_2018.0.3.20180406", + sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", + strip_prefix = "mklml_lnx_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" ], - sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", - strip_prefix = "mklml_win_2018.0.3.20180406", + sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", + strip_prefix = "mklml_win_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" ], - sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", - strip_prefix = "mklml_mac_2018.0.3.20180406", + sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", + strip_prefix = "mklml_mac_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", ], - sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", - strip_prefix = "mkl-dnn-0.14", + sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", + strip_prefix = "mkl-dnn-0.13", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", - "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", 
+ "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", ], - sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", - strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", + sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", + strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index e54c1a4501..07bb6645eb 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,7 +64,6 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", - "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 08cb84ea2c..1b8e40765e 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,7 +10,6 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", - "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 663a218733..4418ac32fc 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,10 +291,8 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", - "jerror.h", "jinclude.h", "jmorecfg.h", - "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 17c5449cc0..76ab32d69c 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,14 +28,7 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ] + select({ - "@org_tensorflow//tensorflow:linux_ppc64le": [ - "powerpc/powerpc_init.c", - "powerpc/filter_vsx_intrinsics.c", - ], - "//conditions:default": [ - ], - }), + ], hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 3c7e5c8469..954f21f5f8 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,7 +6,6 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" -_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -153,22 +152,6 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) -def _get_bash_bin(repository_ctx): - """Gets the bash bin path.""" - bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) - if bash_bin != None: - return bash_bin - else: - bash_bin_path = repository_ctx.which("bash") - if bash_bin_path != None: - return str(bash_bin_path) - else: - _fail("Cannot find bash in PATH, please make sure " + - "bash is installed and add its directory in PATH, or --define " + - "%s='/path/to/bash'.\nPATH=%s" % ( - _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) - - def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -201,14 +184,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -216,7 +199,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -311,7 +294,6 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ - _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index cb67d3e961..36f5aa5bde 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,6 +17,7 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", + "gemmlowp", ]) def _is_windows(ctx): @@ -87,9 +88,7 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - # Use BUILD.bazel to avoid conflict with third party projects with - # BUILD or build (directory) underneath. - ctx.template("BUILD.bazel", ctx.attr.build_file, { + ctx.template("BUILD", ctx.attr.build_file, { "%prefix%": ".." if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From 6070ae0e148f50dbc8f36e1654f0a3f53b8b067e Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 21:00:34 -0700 Subject: Merge changes from github. 
PiperOrigin-RevId: 201110240 --- CONTRIBUTING.md | 2 +- README.md | 1 + RELEASE.md | 67 +++- configure.py | 5 + tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 + tensorflow/cc/gradients/nn_grad.cc | 47 +++ tensorflow/cc/gradients/nn_grad_test.cc | 84 ++++- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 + tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 + tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 ++ .../xla/service/cpu/runtime_single_threaded_fft.h | 31 ++ .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 + tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 + tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ++++ tensorflow/contrib/autograph/__init__.py | 3 + tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 ++ tensorflow/contrib/ffmpeg/__init__.py | 1 - tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 - tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 + tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 + tensorflow/contrib/lite/toco/toco_port.h | 18 + tensorflow/contrib/makefile/compile_nsync.sh | 2 +- .../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 19 +- .../kernels/periodic_resample_op.cc | 5 + .../kernels/periodic_resample_op.h | 415 +++++++++++++++------ .../contrib/periodic_resample/ops/array_ops.cc | 53 ++- .../periodic_resample/ops/array_ops_test.cc | 41 ++ .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- .../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 + .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md 
| 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 +-- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 +++ .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 +++-- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 ++ tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 + .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 +++ .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 + tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 + .../common_runtime/mkl_threadpool_device_test.cc | 53 +++ tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 - tensorflow/core/framework/op_gen_lib.cc | 1 + .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +++++++- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 ++ .../core/grappler/clusters/single_machine_test.cc | 8 +- tensorflow/core/grappler/costs/graph_properties.cc | 1 - tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 + tensorflow/core/kernels/cwise_op_clip.cc | 43 +-- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_functor.cc | 1 + tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_nd_op.cc | 4 + tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 + tensorflow/core/kernels/gather_op.cc | 1 + tensorflow/core/kernels/mkl_concat_op.cc | 213 ++++++++--- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 + tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 + tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 + .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 +++++++ tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 +- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 + tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 ++ tensorflow/core/platform/cpu_info.h | 7 + tensorflow/core/platform/default/build_config.bzl | 2 + .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 + tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 ++- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 
17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 + tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- tensorflow/python/keras/activations.py | 2 + tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 + tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 + .../python/kernel_tests/as_string_op_test.py | 10 + tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 + tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 ++ tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 +++++ tensorflow/python/ops/array_ops.py | 4 + tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 ++++++++++--- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 + tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 + tensorflow/python/ops/string_ops.py | 53 +++ tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 + .../tools/api/golden/tensorflow.strings.pbtxt | 4 + tensorflow/tools/ci_build/builds/pip.sh | 4 + .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 + tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 + .../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 ++ tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- 
.../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 + tensorflow/tools/pip_package/build_pip_package.sh | 160 +++++--- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 + .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 - tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 + third_party/highwayhash.BUILD | 1 + third_party/jpeg/jpeg.BUILD | 2 + third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 231 files changed, 3338 insertions(+), 905 deletions(-) create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h create mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc create mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100755 => 100644 tensorflow/python/tools/import_pb_to_tensorboard.py create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'configure.py') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c25c45..db4b1581ae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 6fb4486d0d..63853137cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ $ python 42 >>> sess.close() ``` +Learn more about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index 84d9d52868..e09e9c6190 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,62 @@ +# Release 1.9.0 + +## Major Features And Improvements +* Update tf.keras to the Keras 2.1.6 API. +* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. +* Adding support for core feature columns and losses to gradient boosted trees estimators. +* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. +* Layered variable names have changed in the following conditions: + * Using `tf.keras.layers` with custom variable scopes. + * Using `tf.layers` in a subclassed `tf.keras.Model` class. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details. + +## Breaking Changes + * If you're opening empty variable scopes, replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...).
+ +## Bug Fixes and Other Changes +* `tf.data`: + * The `DatasetBase::DebugString()` method is now `const`. + * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. +* Eager Execution: +* `tf.keras`: + * Move Keras code out of _impl folder and remove API files. + * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. + * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. +* Accelerated Linear Algebra (XLA): +* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). +* `tf.contrib`: + * Add `tf.contrib.data.choose_from_datasets()`. + * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. + * `tf.contrib.framework.zero_initializer` supports ResourceVariable. + * Adding "constrained_optimization" to tensorflow/contrib. +* Other: + * Add GCS Configuration Ops. + * Changing signature of `MakeIterator` to enable propagating error status. + * KL divergence for two Dirichlet distributions. + * More consistent GcsFileSystem behavior for certain reads past EOF. + * Update benchmark for tf.scan to match ranges across eager and graph modes. + * Fixed bug in `tf.reduce_prod gradient` for complex dtypes. + * Add optional `args` argument to `Dataset.from_generator()`. + * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). + * Benchmark for tf.scan in graph and eager modes. + * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. + * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. + * Support indicator column in boosted trees. + * Prevent `tf.gradients()` from backpropagating through integer tensors. + * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. + * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary. + * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. + * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed. + * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. + * Allow LinearOperator to broadcast. + * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. 
Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang + # Release 1.8.0 ## Major Features And Improvements @@ -404,14 +463,6 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index bde7af8c0e..ada342a50a 100644 --- a/configure.py +++ b/configure.py @@ -1397,6 +1397,10 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') +def set_build_strip_flag(): + write_to_bazelrc('build --strip=always') + + def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1519,6 +1523,7 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) + set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index a73c4ca3aa..6d134dbb80 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. 
tf_cc_shared_object( name = "libtensorflow.so", @@ -489,7 +489,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -515,7 +514,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b61..7184ad68fb 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a..35a01e0341 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
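The `math_grad.cc` hunk above registers `Floor` as an op with no gradient in the C++ gradient registry, consistent with the Python side, where backpropagation through `tf.floor` yields no gradient. A minimal sketch of that effect, assuming TensorFlow 1.x graph mode (illustrative only, not part of this patch):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[3])
y = tf.reduce_sum(tf.floor(x))

# Floor contributes no gradient, so nothing flows back to x.
print(tf.gradients(y, x))  # [None]
```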
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 0cb3132e94..c73482d5f4 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); +Status SoftplusGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); + +Status SoftsignGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); + +Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalAvgPoolGrad( + scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), + grad_inputs[0], op.output(1), op.output(2), + internal::FractionalAvgPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); + +Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalMaxPoolGrad( + scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), + op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index c4eba7ecb0..b4d457a9d1 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,6 +28,8 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; +using ops::FractionalAvgPool; +using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -41,6 +43,8 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; +using ops::Softplus; +using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that the max value is largest by - // a reasonable amount. - // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which - // perturbations by the numeric gradient computation in the gradient checker - // can change the max value if values are too close together. + // Sets tensor with random values, ensuring that every pair of elements are at + // least a reasonable amount apart. 
+ // This is an issue for max pooling operations, in which perturbations by the + // numeric gradient computation in the gradient checker can change the max + // value if a pool has values that are too close together. template - void SetRandomValuesWithBumpedMax(Tensor* tensor) { + void SetRandomValuesForMaxPooling(Tensor* tensor) { auto tensor_flat = tensor->flat(); - tensor_flat.setRandom(); - int32 max_index = 0; - for (size_t i = 1; i < tensor->NumElements(); i++) { - if (tensor_flat(i) > tensor_flat(max_index)) { - max_index = i; - } + // First set the array to an increasing sequence of values spaced + // a reasonable amount apart + T cur = 0; + for (size_t i = 0; i < tensor->NumElements(); i++) { + tensor_flat(i) = cur; + cur += 5e-2; + } + // Fischer-Yates shuffle the array + for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { + // j <- random integer 0 <= j <= i + size_t j = random::New64() % (i + 1); + // swap values at i, j + T tmp = tensor_flat(i); + tensor_flat(i) = tensor_flat(j); + tensor_flat(j) = tmp; } - tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } +TEST_F(NNGradTest, SoftplusGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softplus(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, SoftsignGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softsign(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. + auto y = FractionalAvgPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_shape, y.output, y_shape); +} + +TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. 
+ auto y = FractionalMaxPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + Tensor x_init_value = Tensor(DT_FLOAT, x_shape); + SetRandomValuesForMaxPooling(&x_init_value); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_init_value, y.output, y_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6e050cf564..6641d45e83 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index ebfe4806c2..4e194a6aba 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a a sequence of protocol buffers into an object file. +// Embeds a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f00..d1a669ceb1 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb..06ec623eb2 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index d82922a359..1067b38f93 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,6 +178,7 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -516,7 +517,6 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,6 +578,22 @@ cc_library( ], ) +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_fft_impl.h", + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework_lite", + "//third_party/eigen3", + ], +) + cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 215405f680..54c52bc08f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; +extern const char* const kEigenSingleThreadedFftSymbolName = + "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 1dce6efa5c..aa0e967123 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,6 +52,7 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; +extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2c20be155f..758b8c62b4 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,7 +1172,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - const char* fn_name = runtime::kEigenFftSymbolName; + + bool multi_threaded_eigen = + hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + const char* fn_name = multi_threaded_eigen + ? runtime::kEigenFftSymbolName + : runtime::kEigenSingleThreadedFftSymbolName; + llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 984cb0616e..0bf693edd0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,8 +21,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(in_dims); + const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. @@ -179,7 +172,6 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { - CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT type: " << fft_type; + // Unsupported FFT type + abort(); } } @@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT rank " << fft_rank; + // Unsupported FFT rank + abort(); } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc new file mode 100644 index 0000000000..2613ddb127 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" + +#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int32; +using tensorflow::int64; + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* run_options_ptr, void* out, void* operand, int32 fft_type, + int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { + tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, + fft_rank, input_batch, fft_length0, fft_length1, + fft_length2); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h new file mode 100644 index 0000000000..dcd133d012 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, + void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + tensorflow::int64 input_batch, tensorflow::int64 fft_length0, + tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); + +} // extern "C" + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 8d8c5e4c44..c4c90515ac 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -202,6 +203,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index d3bc47e61e..2515222cf2 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index e536c8afbf..77bdcc9de0 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,10 +30,17 @@ limitations under the License. 
namespace xla { +TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : + exclude_entry_computation_(exclude_entry_computation) {} + StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue worklist; for (auto* computation : module->computations()) { + if (exclude_entry_computation_ && + computation == module->entry_computation()) { + continue; + } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index e5e9b10b5b..7509501883 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,13 +27,20 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() {} + TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} + explicit TupleSimplifier(bool exclude_entry_computation); ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + private: + // When set, this pipeline stage will perform optimization of all computations + // apart from the module's entry computation. This is used by Graphcore's + // backend. + bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index ca9ae91281..d3635eae81 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } + void Run(HloModule* module, bool change_expected, bool exclude_entry) { + TupleSimplifier simplifier(exclude_entry); + auto changed_status = simplifier.Run(module); + TF_ASSERT_OK(changed_status.status()); + EXPECT_EQ(change_expected, changed_status.ValueOrDie()); + } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } +TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { + // Verify that the root computation can be excluded + auto module = CreateNewModule(); + + HloInstruction* p0; + HloInstruction* p1; + HloComputation* c0; + HloComputation* c1; + HloComputation* entry; + + { + HloComputation::Builder builder(TestName() + "_1"); + p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c0 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_2"); + p1 = 
builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c1 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_Entry"); + HloInstruction* tuple_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* call0 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); + HloInstruction* call1 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); + HloInstruction* tuple0 = + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); + HloInstruction* gte3 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); + + entry = module->AddEntryComputation(builder.Build()); + } + + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); + + EXPECT_THAT(c0->root_instruction(), p0); + EXPECT_THAT(c1->root_instruction(), p1); + EXPECT_THAT(entry->instruction_count(), 9); +} + } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index 8fd83ef376..361cf2d77c 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,6 +23,7 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
from tensorflow.contrib.autograph import utils +from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -43,6 +44,8 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', + # Overloaded operators + 'operators', # Python language "extensions" 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index bda5e26f43..2e0a2fcef4 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,13 +37,15 @@ add_dependencies( tf_core_lib tf_protos_cc) -add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" -) -add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) +if(tensorflow_BUILD_PYTHON_BINDINGS) + add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" + ) + add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) +endif() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index f73da0b8ab..6c90cf398c 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index a0c3ddd28b..9244604489 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,7 +832,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) - add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index cffe069aa3..4f957f1e0b 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,7 +44,8 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") 
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|" + r"python_op_gen_internal|grappler") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -56,6 +57,10 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" + r"tensorflow::errors::Internal|" + r"tensorflow::Tensor::CopyFromInternal|" + r"tensorflow::kernel_factory::" + r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -64,7 +69,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"perftools::gputools") + r"stream_executor::") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 45760a29ee..795f1993ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,16 +151,24 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision. + # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and + # below test fails due to overflow error giving inf. So this check avoids that error by skipping square + # calculation and corresponding assert. + + if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ + np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): + + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.) 
self.assertAllClose( -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), bijector.forward_log_det_jacobian(x, event_ndims=0).eval(), diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index d7909dd5a2..adf92c27ea 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name("function_buffer_resource")) + shared_name=_generate_shared_name( + "contrib_eager_iterator_function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb index 4fe3a0e3f3..5749f22ac5 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -68,7 +68,7 @@ "# simply construct the object. Most layers take as a first argument the number\n", "# of output dimensions / channels.\n", "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensionss is often unnecessary, as it can be inferred\n", + "# The number of input dimensions is often unnecessary, as it can be inferred\n", "# the first time the layer is used, but it can be provided if you want to \n", "# specify it manually, which is useful in some complex models.\n", "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" @@ -267,7 +267,7 @@ " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", " * `call`, where you do the forward computation\n", "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." ] }, { diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 84a413c791..05bcdac2ca 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,7 +346,8 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32): + dtype=dtypes.float32, + normalizer_fn=None): """Returns a feature column that represents sequences of numeric data. 
Example: @@ -370,6 +371,12 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. + normalizer_fn: If not `None`, a function that can be used to normalize the + value of the tensor after `default_value` is applied for parsing. + Normalizer function takes the input `Tensor` as its argument, and returns + the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that + even though the most common use case of this function is normalization, it + can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -383,12 +390,16 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) + if normalizer_fn is not None and not callable(normalizer_fn): + raise TypeError( + 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype) + dtype=dtype, + normalizer_fn=normalizer_fn) def _assert_all_equal_and_return(tensors, name=None): @@ -407,7 +418,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype'])): + ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): """Represents sequences of numeric data.""" @property @@ -419,7 +430,10 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - return inputs.get(self.key) + input_tensor = inputs.get(self.key) + if self.normalizer_fn is not None: + input_tensor = self.normalizer_fn(input_tensor) + return input_tensor @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index ee74cf56dc..45d7b74046 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -947,6 +948,7 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -965,6 +967,10 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') + def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -985,6 +991,41 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor, 
dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_normalizer_fn(self): + + def _increment_two(input_sparse_tensor): + return sparse_ops.sparse_add( + input_sparse_tensor, + sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) + ) + + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, values [[0.], [1]] + # example 1, [[10.]] + indices=((0, 0), (0, 1), (1, 0)), + values=(0., 1., 10.), + dense_shape=(2, 2)) + + # Before _increment_two: + # [[0.], [1.]], + # [[10.], [0.]], + # After _increment_two: + # [[2.], [1.]], + # [[10.], [2.]], + expected_dense_tensor = [ + [[2.], [1.]], + [[10.], [2.]], + ] + numeric_column = sfc.sequence_numeric_column( + 'aaa', normalizer_fn=_increment_two) + + dense_tensor, _ = numeric_column._get_sequence_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) + + with monitored_session.MonitoredSession() as sess: + self.assertAllEqual( + expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index daba965a98..484ffee3e7 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,7 +28,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio -from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index 020b5c99c6..b1b5126d9e 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py -from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 10d1ecc738..dc49383c5c 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec -from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest', 'broadcast_to'] +_allowed_symbols = ['nest'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index 65cb94b5a4..a955e21b72 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - print("expected = ", ref_value) - print("actual = ", value) + tf_logging.info("expected = ", ref_value) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,7 +843,8 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - print("output_height=", output_height, ", output_width=", output_width) + tf_logging.info("output_height=", output_height, ", output_width=", + output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -880,8 +881,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - print("actual_y = ", actual_y) - print("expected_y = ", expected_y) + tf_logging.info("actual_y = ", actual_y) + tf_logging.info("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 6a5d982dc8..2e5c84704f 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 436c3e1d4c..840015a7fa 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. 
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc index 106e3b0270..8b0ace96cc 100644 --- a/tensorflow/contrib/lite/examples/minimal/minimal.cc +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -38,7 +38,7 @@ using namespace tflite; int main(int argc, char *argv[]) { if(argc != 2) { - fprintf(stderr, "Usage: %s \n"); + fprintf(stderr, "minimal \n"); return 1; } const char* filename = argv[1]; diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index bb2e615eac..965273f0f0 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -128,7 +128,6 @@ TensorFlow operation not listed above are likely unsupported. Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) -* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) @@ -306,6 +305,19 @@ Options { } ``` +**GATHER** + +``` +Inputs { + 0: params tensor + 1: indices tensor + 2: axis tensor (optional) +} +Outputs { + 0: a tensor with same type as the params tensor. +} +``` + **GREATER** ``` diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 5efa70987e..26349347fa 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,7 +2,7 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requesits +## Pre-requisite Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. @@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). 
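For reference, the GATHER entry added to the compatibility table above corresponds to tf.gather on the TensorFlow side. A minimal sketch of the params/indices/axis triple it documents; the values are illustrative only:

```python
import tensorflow as tf

params = tf.constant([[1, 2], [3, 4], [5, 6]])
indices = tf.constant([2, 0])
output = tf.gather(params, indices, axis=0)  # output dtype matches params

with tf.Session() as sess:
  print(sess.run(output))  # => [[5 6] [1 2]]
```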
Let say the submission file is located at `/path/to/my_model.lite`, then call: ```sh -bazel build --cxxopt--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite ``` diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index a2f192bbc2..1908f7fa6c 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // The quantization of the input, output arrays is as follows: // - The input activations are quantized as uint8 on the interval // [-1, 127/128]. -// The rationale for that is that that is the natural interval for output +// The rationale for that is that is the natural interval for output // activations (see next point) and these need to be concatenated together. // We could accommodate different ranges by re-scaling, but we empirically // found that setting the input activations range to be [-1, 127/128] in the @@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // However, for a fixed-point implementation in 16-bit integers, using 5 // integer bits to represent the [-16, 16] range would leave only 11 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive -// representable values. Notice that that is higher than the +// representable values. Notice that is higher than the // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. // Using [-8, 8] thus seems like the better compromise overall, enjoying // an increment of 2.4e-4 between representable values and a worst-case diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index 9400e757b9..fd90823425 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -55,7 +55,7 @@ class Interpreter(object): elif model_content and not model_path: self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( - model_content, len(model_content))) + model_content)) if not self._interpreter: raise ValueError( 'Failed to create model from {} bytes'.format(len(model_content))) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc index f705551fcb..b283551c45 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -397,9 +397,14 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile( } InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer( - const char* data, size_t len) { + PyObject* data) { + char * buf = nullptr; + Py_ssize_t length; + if (PY_TO_CPPSTRING(data, &buf, &length) == -1) { + return nullptr; + } std::unique_ptr model = - tflite::FlatBufferModel::BuildFromBuffer(data, len); + tflite::FlatBufferModel::BuildFromBuffer(buf, length); return model ? 
new InterpreterWrapper(std::move(model)) : nullptr; } diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index b0ed7c4559..cbeb53bee7 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -40,8 +40,7 @@ class InterpreterWrapper { static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path); // SWIG caller takes ownership of pointer. - static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data, - size_t len); + static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data); ~InterpreterWrapper(); bool AllocateTensors(); diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 0913cd2c5c..88dda7290b 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -34,6 +34,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from six import PY3 + from google.protobuf import text_format as _text_format from google.protobuf.message import DecodeError from tensorflow.contrib.lite.python import lite_constants as constants @@ -54,6 +56,7 @@ from tensorflow.python.framework.importer import import_graph_def from tensorflow.python.ops.variables import global_variables_initializer from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +# from tensorflow.python.util.all_util import remove_undocumented class TocoConverter(object): @@ -203,6 +206,12 @@ class TocoConverter(object): except (_text_format.ParseError, DecodeError): try: print("Ignore 'tcmalloc: large alloc' warnings.") + + if not isinstance(file_content, str): + if PY3: + file_content = file_content.decode('utf-8') + else: + file_content = file_content.encode('utf-8') _text_format.Merge(file_content, graph_def) except (_text_format.ParseError, DecodeError): raise ValueError( @@ -382,3 +391,5 @@ def _freeze_graph(sess, output_tensors): output_arrays) else: return sess.graph_def + +# remove_undocumented(__name__) diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 4465f953ba..caca199d2e 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) { else if (dtype == DT_STRING) return ArrayDataType::kString; else - LOG(INFO) << "Unsupported data type in placehoder op: " << dtype; + LOG(INFO) << "Unsupported data type in placeholder op: " << dtype; return ArrayDataType::kNone; } diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index 1b21c8bc60..de76fd4032 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -20,6 +20,12 @@ limitations under the License. 
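With CreateWrapperCPPFromBuffer now taking the Python buffer object directly, the Python-level Interpreter usage is unchanged. A minimal sketch of loading an interpreter from in-memory model content; the file path is hypothetical:

```python
from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper

# Hypothetical flatbuffer file; any bytes object holding a TFLite model works.
with open('/tmp/model.tflite', 'rb') as f:
  model_content = f.read()

interpreter = interpreter_wrapper.Interpreter(model_content=model_content)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
```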
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" +#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) +namespace std { +double round(double x) { return ::round(x); } +} // namespace std +#endif + namespace toco { namespace port { void CopyToBuffer(const string& src, char* dest) { diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 5c019cb2bf..17f82b9dd7 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -34,6 +34,24 @@ limitations under the License. #define TFLITE_PROTO_NS google::protobuf #endif +#ifdef __ANDROID__ +#include +namespace std { + +template +std::string to_string(T value) +{ + std::ostringstream os ; + os << value ; + return os.str() ; +} + +#ifdef __ARM_ARCH_7A__ +double round(double x); +#endif +} +#endif + namespace toco { namespace port { diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index e8c6edd7ba..a28fc3a87f 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -270,7 +270,7 @@ for arch in $archs; do PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ - ../../platform/c++11/src/per_thread_waiter.cc \ + ../../platform/posix/src/per_thread_waiter.c \ ../../platform/c++11/src/yield.cc \ ../../platform/c++11/src/time_rep_timespec.cc \ ../../platform/c++11/src/nsync_panic.cc diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index eff9081e35..48953e2e38 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. -GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 2ed99d50a4..a6be2084aa 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name): name: An optional variable_scope name. Returns: - The recall at a the given `precision`. + The recall at a given `precision`. 
""" precisions = math_ops.div(tp, tp + fp + _EPSILON) tf_index = math_ops.argmin( diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h index 1d56d588bc..c001615d3f 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h @@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI(); * has the fully accumulated Segment 1; and so on. The scatter-reduce is * complete. * - * Next, the allgather distributes these fully accumululated chunks across all + * Next, the allgather distributes these fully accumulated chunks across all * nodes. Communication proceeds in the same ring, once again in N-1 steps. At * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). * For example, at the first iteration, the following transfers will occur: diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 21bf3f5313..915e6504e1 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase): var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), + rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), + rtol=1e-2) if use_resource: self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index a7c97a1da2..b6b10e500b 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object): """ def __init__(self, worker_device): - """Create a new `ElasticAverageCustomGetter`. + """Create a new `ModelAverageCustomGetter`. Args: worker_device: String. Name of the `worker` job. 
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index 6ca7fe8b6e..f2171efc95 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,12 +6,13 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "py_test", + "tf_cc_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) +load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -84,6 +85,22 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", + "//tensorflow/python:gradient_checker", + ], +) + +tf_cc_test( + name = "periodic_resample_op_cc_test", + size = "small", + srcs = [ + "ops/array_ops_test.cc", + ], + deps = [ + ":all_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index e18923c8aa..514689cf45 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,4 +22,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); + +REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") + .Device(DEVICE_CPU), + PeriodicResampleOpGrad); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 3ab588c458..42fba81a5c 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,92 +25,202 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" namespace { -template -IndexT compute_input_index( - IndexVecT* target_dimensions, const IndexT& output_index, - const IndexVecT& original_dimensions, const int& adjustable_dimension, - const std::vector& dimension_ceiling, - const std::vector& cumulative_dimensions, IndexT* result, - std::vector* output_indices, const int& rank) { - *result = 0; - output_indices->clear(); +// Computes input tensor index for given output index during forward +// propagation through periodic_resample operation. +class InputIndexer { + public: + InputIndexer(const std::vector& output_dimensions, + const tensorflow::TensorShape& input_shape, + int adjustable_dimension) + : output_dimensions_(output_dimensions), + adjustable_dimension_(adjustable_dimension), + rank_(input_shape.dims()), + linear_output_index_(0), + linear_input_index_(0), + adjustable_dimension_carriage_sum_(0) { + auto input_dimensions = TensorShapeToVector(input_shape); + // factors by which input_dimensions increases/decreases w.r.t. 
+ // output_dimensions + dimension_ceiling_ = + ComputeDimensionCeiling(output_dimensions, input_dimensions); + cumulative_dimensions_ = ComputeCumulativeDimensions(); + + output_indices_.resize(output_dimensions_.size()); + input_indices_.resize(output_dimensions_.size()); + + // Compute index_factors + index_factors_.resize(rank_); + tensorflow::int64 last_index_factor = 1; + for (auto r = rank_ - 1; r >= 0; --r) { + index_factors_[r] = last_index_factor; + last_index_factor *= input_dimensions[r]; + } + } + + tensorflow::int64 linear_input_index() const { return linear_input_index_; } + + void MoveToOutputIndex(tensorflow::int64 output_index); + void IncrementOutputIndex(); + + private: + void RecomputeInputAdjustableDimensionIndex() { + tensorflow::int64 index = adjustable_dimension_carriage_sum_; + index *= output_dimensions_[adjustable_dimension_]; + index += output_indices_[adjustable_dimension_]; + input_indices_[adjustable_dimension_] = index; + } + + std::vector TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape); + + std::vector ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions); + + std::vector ComputeCumulativeDimensions(); + + const std::vector output_dimensions_; + std::vector dimension_ceiling_; + std::vector index_factors_; + std::vector cumulative_dimensions_; + std::vector output_indices_; + std::vector input_indices_; + + const int adjustable_dimension_; + const int rank_; + tensorflow::int64 linear_output_index_; + tensorflow::int64 linear_input_index_; + tensorflow::int64 adjustable_dimension_carriage_sum_; +}; + +void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) { + linear_output_index_ = output_index; + linear_input_index_ = 0; // un-rasterize the output index auto last_reduced_i = output_index; - for (auto r = rank - 1; r >= 0; --r) { - (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + output_indices_[r] = last_reduced_i % output_dimensions_[r]; last_reduced_i = - (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r]; + (last_reduced_i - output_indices_[r]) / output_dimensions_[r]; } + tensorflow::int64 carriage_sum = 0; + for (int qi = 0; qi < rank_; ++qi) { + if (qi == adjustable_dimension_) continue; + carriage_sum += cumulative_dimensions_[qi] * + (output_indices_[qi] % dimension_ceiling_[qi]); + } + adjustable_dimension_carriage_sum_ = carriage_sum; + // rasterize the input index - IndexT last_index_factor = 1; - for (auto r = rank - 1; r >= 0; --r) { - IndexT index = 0; - if (r != adjustable_dimension) - index = (*output_indices)[r] / dimension_ceiling[r]; - else { - for (int qi = 0; qi < rank; ++qi) { - if (qi == adjustable_dimension) continue; - index += cumulative_dimensions[qi] * - ((*output_indices)[qi] % dimension_ceiling[qi]); - } - index *= (*target_dimensions)[adjustable_dimension]; - index += (*output_indices)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + if (r != adjustable_dimension_) { + input_indices_[r] = output_indices_[r] / dimension_ceiling_[r]; + } else { + RecomputeInputAdjustableDimensionIndex(); } - *result += last_index_factor * index; - last_index_factor *= original_dimensions[r]; } + for (auto r = rank_ - 1; r >= 0; --r) { + linear_input_index_ += index_factors_[r] * input_indices_[r]; + } +} + +void InputIndexer::IncrementOutputIndex() { + linear_output_index_++; + for (auto r = rank_ - 1; r >= 0; --r) { + auto old_carriage_sum_increment = + cumulative_dimensions_[r] * + 
(output_indices_[r] % dimension_ceiling_[r]); + output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r]; + if (r != adjustable_dimension_) { + auto new_input_index = output_indices_[r] / dimension_ceiling_[r]; + linear_input_index_ += + (new_input_index - input_indices_[r]) * index_factors_[r]; + + input_indices_[r] = new_input_index; + + auto new_carriage_sum_increment = + cumulative_dimensions_[r] * + (output_indices_[r] % dimension_ceiling_[r]); - return *result; + adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ - + old_carriage_sum_increment + + new_carriage_sum_increment; + } + + if (output_indices_[r] != 0) { + // No more carries to higher indices. + break; + } + } + auto old_adjustable_dimension_input_index = + input_indices_[adjustable_dimension_]; + RecomputeInputAdjustableDimensionIndex(); + linear_input_index_ += (input_indices_[adjustable_dimension_] - + old_adjustable_dimension_input_index) * + index_factors_[adjustable_dimension_]; } -template // both types are needed here b/c IndexVecT and - // InputDataT are not related - void - fill_periodic_tensor( - tensorflow::OpKernelContext* context, - const IndexVecT& desired_shape, - const tensorflow::Tensor& input_tensor) { - // input is a strided array (last index is fastest, C-ordered) - auto input = input_tensor.flat(); - const int rank = input_tensor.dims(); - // original and target dimensions - std::vector original_dimensions(rank), - target_dimensions(rank); - tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1); - // factors by which original_dimensions increases/decreases w.r.t. - // target_dimensions - std::vector dimension_ceiling(rank), - cumulative_dimensions(rank); - // index of adjustable dimension - int adjustable_dimension; - tensorflow::TensorShape output_shape; +std::vector InputIndexer::TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape) { + std::vector result(tensor_shape.dims()); + int count = 0; + for (const auto dim_info : tensor_shape) { + result[count] = dim_info.size; + ++count; + } + return result; +} - // requires that the rank of the input tensor and length of the desired shape - // are equal - OP_REQUIRES(context, rank == desired_shape.size(), - tensorflow::errors::InvalidArgument( - "periodic_resample expects the rank of the input tensor, ", - rank, ", to be the same as the length of the desired shape, ", - desired_shape.size(), ".")); +std::vector InputIndexer::ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions) { + std::vector dimension_ceiling(input_dimensions.size()); + for (size_t i = 0; i < input_dimensions.size(); ++i) { + dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) / + input_dimensions[i]; + } + return dimension_ceiling; +} - bool found = false; - const auto& input_tensor_shape = input_tensor.shape(); +std::vector InputIndexer::ComputeCumulativeDimensions() { + std::vector cumulative_dimensions(rank_); + int count = 0; + for (int i = 0; i < rank_; ++i) { + if (count == 0) { + cumulative_dimensions[count] = 1; + } else { + cumulative_dimensions[count] = + cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1]; + } + ++count; + } + return cumulative_dimensions; +} +template +void process_desired_shape(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& input_tensor_shape, + const IndexVecT& desired_shape, + int* adjustable_dimension, + std::vector* target_dimensions, + tensorflow::int64* output_size) { + 
tensorflow::int64 new_sliced_size = 1; + bool found = false; + const int rank = input_tensor_shape.dims(); for (int i = 0; i < rank; ++i) { - // if (desired_shape(i) < 1) { if (desired_shape[i] < 1) { // only one index can be adjustable OP_REQUIRES(context, !found, tensorflow::errors::InvalidArgument( "periodic_resample expects only " "one index to be marked as adjustable.")); - adjustable_dimension = i; + *adjustable_dimension = i; found = true; } else { OP_REQUIRES( @@ -122,9 +232,8 @@ template +void +do_periodic_resample_op(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape, + const tensorflow::Tensor& source_tensor) { + const int rank = source_tensor.dims(); + + // requires that the rank of the input tensor and length of the desired shape + // are equal + OP_REQUIRES(context, rank == desired_shape.dims(), + tensorflow::errors::InvalidArgument( + "periodic_resample expects the rank of the input tensor, ", + rank, ", to be the same as the length of the desired shape, ", + desired_shape.dims(), ".")); + + std::vector target_dimensions(rank); + tensorflow::int64 new_size = 0; + // index of adjustable dimension + int adjustable_dimension = 0; + process_desired_shape(context, original_shape, desired_shape.dim_sizes(), + &adjustable_dimension, &target_dimensions, &new_size); // ensure that the new dimension is greater than zero OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0, @@ -160,11 +293,14 @@ template allocate_output(0, output_shape, &output_tensor)); auto output = output_tensor->flat(); - // memory is allocated for these variables outside the inner loop for - // efficiency (although, I could create a separate class scope for - // this purpose instead) - tensorflow::int64 result = 0; - std::vector output_indices(target_dimensions.size()); + // input is a strided array (last index is fastest, C-ordered) + auto input = source_tensor.flat(); // Fill output tensor with periodically resampled input tensor values - for (tensorflow::int64 output_index = 0; output_index < new_size; - ++output_index) { - output(output_index) = input(compute_input_index( - &target_dimensions, output_index, original_dimensions, - adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result, - &output_indices, rank)); - } + InputIndexer input_indexer(target_dimensions, original_shape, + adjustable_dimension); + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + auto fill_output_tensor = [&input_indexer, &output, &input]( + tensorflow::int64 start, tensorflow::int64 limit) { + InputIndexer local_indexer(input_indexer); + local_indexer.MoveToOutputIndex(start); + for (tensorflow::int64 output_index = start; output_index < limit; + ++output_index) { + if (mode == Mode::kForward) { + output(output_index) = input(local_indexer.linear_input_index()); + } else { + output(local_indexer.linear_input_index()) = input(output_index); + } + local_indexer.IncrementOutputIndex(); + } + }; + ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, + new_size, costPerFillIndex, fill_output_tensor); } +#define DATA_TYPE_SWITCH(data_type, context, CASE) \ + switch (data_type) { \ + CASE(float) \ + CASE(double) \ + CASE(tensorflow::int32) \ + CASE(tensorflow::int64) \ + default: \ + context->CtxFailure(__FILE__, __LINE__, \ + tensorflow::errors::InvalidArgument( \ + "Unsuppored tensor elements type")); \ + break; \ + } + void create_output_tensor( tensorflow::OpKernelContext* 
context, const tensorflow::Tensor& input_tensor, const tensorflow::DataType& input_tensor_type, - const tensorflow::PartialTensorShape& desired_shape_tensor) { - auto desired_shape = desired_shape_tensor.dim_sizes(); - - // obligatory type switch - switch (input_tensor_type) { - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, input_tensor.shape(), desired_shape, input_tensor); \ break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); + + DATA_TYPE_SWITCH(input_tensor_type, context, CASE); +#undef CASE +} + +void create_grad_tensor(tensorflow::OpKernelContext* context, + const tensorflow::Tensor& grad_tensor, + const tensorflow::DataType& grad_tensor_type, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, original_shape, desired_shape, grad_tensor); \ break; - default:; - } + + DATA_TYPE_SWITCH(grad_tensor_type, context, CASE); +#undef CASE } } // namespace @@ -238,4 +400,25 @@ class PeriodicResampleOp : public tensorflow::OpKernel { tensorflow::PartialTensorShape desired_shape; }; +class PeriodicResampleOpGrad : public tensorflow::OpKernel { + public: + explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context) + : tensorflow::OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("original_shape", &original_shape)); + OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape)); + } + + void Compute(tensorflow::OpKernelContext* context) override { + const tensorflow::Tensor& grad_tensor = context->input(0); + const tensorflow::DataType grad_tensor_type = context->input_dtype(0); + create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape, + desired_shape); + } + + private: + tensorflow::TensorShape original_shape; + tensorflow::PartialTensorShape desired_shape; +}; + #endif // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_ diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc index 82bd796956..fd38cd09b4 100644 --- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc +++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc @@ -26,7 +26,42 @@ REGISTER_OP("PeriodicResample") .Input("values: T") .Attr("shape: shape") .Output("output: T") - .SetShapeFn(shape_inference::ExplicitShape) + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::PartialTensorShape desired_shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape)); + shape_inference::ShapeHandle input_tensor_shape = c->input(0); + shape_inference::DimensionHandle num_input_elements = + c->NumElements(input_tensor_shape); + shape_inference::ShapeHandle result_shape_handle; + if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) { + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + desired_shape, &result_shape_handle)); + } else { + const int rank = c->Rank(input_tensor_shape); + std::vector target_dimensions(rank); 
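The shape function added here mirrors the kernel's process_desired_shape logic: the single dimension marked with a value below one is adjustable and receives whatever factor is left after the known desired dimensions are divided out of the input element count. A small Python sketch of that arithmetic, using the shapes from the test below:

```python
import numpy as np

input_shape = (2, 2, 4)        # fully known input shape
desired_shape = [4, 4, None]   # None (or any value < 1) marks the adjustable axis

num_elements = int(np.prod(input_shape))                       # 16
fixed_product = int(np.prod([d for d in desired_shape if d]))  # 16
adjustable = num_elements // fixed_product                     # 1
output_shape = [d if d else adjustable for d in desired_shape]
assert output_shape == [4, 4, 1]
```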
+ tensorflow::int64 new_sliced_size = 1; + int adjustable_dimension = 0; + for (int i = 0; i < rank; ++i) { + if (desired_shape.dim_size(i) < 1) { + adjustable_dimension = i; + } else { + target_dimensions[i] = desired_shape.dim_size(i); + new_sliced_size *= target_dimensions[i]; + } + } + target_dimensions[adjustable_dimension] = + shape_inference::InferenceContext::Value( + num_input_elements) / new_sliced_size; + tensorflow::TensorShape result_shape; + for (int i = 0; i < rank; ++i) { + result_shape.AddDim(target_dimensions[i]); + } + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape( + result_shape, &result_shape_handle)); + } + c->set_output(0, result_shape_handle); + return Status::OK(); + }) .Doc(R"doc( Periodically resample elements of a tensor to conform to `shape`. @@ -101,4 +136,20 @@ output: Periodically resampled tensor that has dimensions specified as in )doc"); + +REGISTER_OP("PeriodicResampleOpGrad") + .Attr("T: numbertype") + .Input("grad: T") + .Attr("original_shape: shape") + .Attr("desired_shape: shape") + .Output("grad_values: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::TensorShape original_shape; + TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s)); + c->set_output(0, s); + return Status::OK(); +}); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc new file mode 100644 index 0000000000..43b7c1799f --- /dev/null +++ b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { + ShapeInferenceTestOp op("PeriodicResample"); + // Case 1: output shape can be fully inferreed. + PartialTensorShape shape({4, 4, -1}); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + + TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") + .Input({"values", 0, DT_INT32}) + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "[2,2,4]", "[4,4,1]"); + // Case 2: output shape can not be inferred - report desired shape. 
+ INFER_OK(op, "[2,2,?]", "[4,4,?]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index a25de55e18..31a6fe1d94 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,8 +21,11 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -93,7 +96,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): - variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -103,6 +105,29 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() + def testPeriodicResampleGradient(self): + desired_shape = numpy.array([4, 4, None]) + result_shape = (4, 4, 1) + input_shape = (2, 2, 4) + with self.test_session() as sess: + x = array_ops.placeholder(dtypes.float32, shape=input_shape) + output = periodic_resample(x, desired_shape) + error = gradient_checker.compute_gradient_error( + x, input_shape, output, result_shape) + self.assertLess(error, 1e-4) + + def testPeriodicResampleShapeInference(self): + with self.test_session() as sess: + # Case 1: output shape can be fully inferreed. + x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) + output = periodic_resample(x, [4, 4, None]) + self.assertEqual(output.shape, [4, 4, 1]) + # Case 2: output shape can not be inferred - report desired shape. 
+ x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) + output = periodic_resample(x, [4, 4, None]) + self.assertTrue(output.shape.is_compatible_with([4, 4, None])) + self.assertEqual(output.shape[2].value, None) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 348623d8f8..470e300ccb 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,11 +21,17 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad from tensorflow.contrib.util import loader +from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) + +@ops.RegisterGradient("PeriodicResample") +def _periodic_resample_grad_cc(op, grad): + return periodic_resample_op_grad( + grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index b7a98c68e2..af3b2ad1b5 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,7 +34,8 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -48,6 +49,7 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -58,6 +60,7 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index d78d94c269..a725072e72 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,7 +51,8 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -62,6 +63,7 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. 
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -71,6 +73,7 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index 6e77e934fe..f275bc15ad 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,7 +30,8 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -44,6 +45,7 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -62,13 +64,15 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph) + graph=graph, + config=config) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -79,6 +83,7 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -93,14 +98,19 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) + estimator, + serving_input_receiver_fn, + output_key=output_key, + graph=graph, + config=config) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -115,6 +125,7 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -128,4 +139,5 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph) + graph=graph, + config=config) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index 578d9424b2..a2ef1dc3af 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common +from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -41,6 +42,11 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') + def testFromSavedModelWithSessionConfig(self): + """Test loading from_saved_model with session config.""" + predictor_factories.from_saved_model( + self._export_dir, config=config_pb2.ConfigProto()) + def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -53,6 +59,13 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') + def testFromContribEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + predictor_factories.from_contrib_estimator( + estimator, input_fn, output_alternative_key='sum', + config=config_pb2.ConfigProto()) + def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -64,6 +77,12 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) + def testFromCoreEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + predictor_factories.from_estimator( + estimator, input_fn, config=config_pb2.ConfigProto()) + def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 0dbca0f813..95da6d04ed 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,7 +121,8 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -142,6 +143,7 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
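The new config argument threads a session ConfigProto through every predictor factory. A minimal usage sketch; the export directory and feed values are hypothetical:

```python
from tensorflow.contrib import predictor
from tensorflow.core.protobuf import config_pb2

session_config = config_pb2.ConfigProto(allow_soft_placement=True)
saved_model_predictor = predictor.from_saved_model(
    '/tmp/saved_model_dir', config=session_config)  # hypothetical export dir
predictions = saved_model_predictor({'x': [[1.0, 2.0, 3.0]]})  # 'x' assumed input name
```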
@@ -152,7 +154,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session() + self._session = session.Session(config=config) loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index c83623ec94..27a933c0f9 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). +[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 94fc12ca81..3d0308aaf3 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,7 +26,6 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -37,6 +36,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,9 +136,10 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) - accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, - labels) + accuracy0, update_op0 = metrics.accuracy( + labels=labels, predictions=predictions) + accuracy1, update_op1 = metrics.accuracy( + labels=labels, predictions=predictions + 1) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -198,8 +199,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metric_ops.streaming_accuracy( - predictions_limited, labels_limited) + value_op, update_op = metrics.accuracy( + labels=labels_limited, 
predictions=predictions_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -260,8 +261,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -276,8 +277,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index 99ced53e11..d22b80ac88 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,6 +21,7 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -30,9 +31,11 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... +``` To use it with graph execution, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -53,7 +56,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
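The streaming_* metrics replaced throughout these tests are the tf.metrics equivalents with (labels=, predictions=) keyword order; both APIs return the same (value_op, update_op) pair. A minimal sketch of that pair in use, with illustrative tensors:

```python
import tensorflow as tf

labels = tf.constant([1, 0, 1, 1])
predictions = tf.constant([1, 0, 0, 1])
value_op, update_op = tf.metrics.accuracy(labels=labels, predictions=predictions)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())  # metric counters are local variables
  sess.run(update_op)                         # accumulate total/count
  print(sess.run(value_op))                   # 0.75
```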
- +``` """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index e893e1d1c8..d8236a0a6f 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,12 +38,13 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) + return metrics.mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metric_ops.streaming_accuracy(predictions, targets, weights=weights) + return metrics.accuracy( + labels=targets, predictions=predictions, weights=weights) def _r2(probabilities, targets, weights=None): @@ -53,7 +54,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metric_ops.streaming_mean(score, weights=weights) + return metrics.mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -62,7 +63,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -71,7 +72,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -82,7 +83,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -90,34 +91,36 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metric_ops.streaming_precision(predictions, targets, weights=weights) + return metrics.precision( + labels=targets, predictions=predictions, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metric_ops.streaming_precision_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.precision_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metric_ops.streaming_recall(predictions, targets, weights=weights) + return metrics.recall( + labels=targets, predictions=predictions, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metric_ops.streaming_recall_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.recall_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), - targets, weights=weights) + return metrics.auc( + labels=targets, + predictions=array_ops.slice(probs, [0, 1], [-1, 1]), + weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 7a35a70bbe..6f62cd11a9 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeTrainingVariables(object): +class TreeVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeTrainingVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training): + def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,27 +315,28 @@ class TreeTrainingVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, '', self.get_tree_name('stats', tree_num)) + params, tree_stat, self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, '', self.stats, self.get_tree_name('tree', tree_num)) + params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestTrainingVariables(object): +class ForestVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeTrainingVariables object for each tree. We override the + Instantiates a TreeVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestTrainingVariables(params) + forest_variables = ForestVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeTrainingVariables): + tree_variables_class=TreeVariables, + tree_configs=None, tree_stats=None): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
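ForestVariables now accepts per-tree serialized configs and stats, which RandomForestGraphs forwards (see the constructor change in the next hunk). A minimal sketch of rebuilding a forest from previously serialized tree protos; restored_tree_param is an assumed input, a serialized decision-tree Model proto such as the one constructed in the test below:

```python
from tensorflow.contrib.tensor_forest.python import tensor_forest

hparams = tensor_forest.ForestHParams(
    num_classes=2, num_features=2, num_trees=1, max_nodes=1000,
    split_after_samples=25).fill()

input_data = [[-1., 0.], [1., 0.]]  # two example rows with num_features=2
# restored_tree_param: serialized Model proto saved from a trained forest (assumed).
graph_builder = tensor_forest.RandomForestGraphs(
    hparams, tree_configs=[restored_tree_param])
probs, paths, var = graph_builder.inference_graph(input_data)
```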
@@ -347,7 +348,13 @@ class ForestTrainingVariables(object): for i in range(params.num_trees): with ops.device(self.device_dummies[i].device): - self.variables.append(tree_variables_class(params, i, training)) + kwargs = {} + if tree_configs is not None: + kwargs.update(dict(tree_config=tree_configs[i])) + if tree_stats is not None: + kwargs.update(dict(tree_stat=tree_stats[i])) + self.variables.append(tree_variables_class( + params, i, training, **kwargs)) def __setitem__(self, t, val): self.variables[t] = val @@ -361,9 +368,11 @@ class RandomForestGraphs(object): def __init__(self, params, + tree_configs=None, + tree_stats=None, device_assigner=None, variables=None, - tree_variables_class=TreeTrainingVariables, + tree_variables_class=TreeVariables, tree_graphs=None, training=True): self.params = params @@ -371,9 +380,10 @@ class RandomForestGraphs(object): device_assigner or framework_variables.VariableDeviceChooser()) logging.info('Constructing forest with params = ') logging.info(self.params.__dict__) - self.variables = variables or ForestTrainingVariables( + self.variables = variables or ForestVariables( self.params, device_assigner=self.device_assigner, training=training, - tree_variables_class=tree_variables_class) + tree_variables_class=tree_variables_class, + tree_configs=tree_configs, tree_stats=tree_stats) tree_graph_class = tree_graphs or RandomTreeGraphs self.trees = [ tree_graph_class(self.variables[i], self.params, i) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index bbe627b157..1c9c81827e 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from google.protobuf.json_format import ParseDict +from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.ops import resources +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -110,6 +114,47 @@ class TensorForestTest(test_util.TensorFlowTestCase): self.assertTrue(isinstance(paths, ops.Tensor)) self.assertTrue(isinstance(var, ops.Tensor)) + def testInfrenceFromRestoredModel(self): + input_data = [[-1., 0.], [-1., 2.], # node 1 + [1., 0.], [1., -2.]] # node 2 + expected_prediction = [[0.0, 1.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]] + hparams = tensor_forest.ForestHParams( + num_classes=2, + num_features=2, + num_trees=1, + max_nodes=1000, + split_after_samples=25).fill() + tree_weight = {'decisionTree': + {'nodes': + [{'binaryNode': + {'rightChildId': 2, + 'leftChildId': 1, + 'inequalityLeftChildTest': + {'featureId': {'id': '0'}, + 'threshold': {'floatValue': 0}}}}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 1}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 2}]}} + restored_tree_param = ParseDict(tree_weight, + _tree_proto.Model()).SerializeToString() + graph_builder = tensor_forest.RandomForestGraphs(hparams, + [restored_tree_param]) + probs, paths, var = graph_builder.inference_graph(input_data) + 
self.assertTrue(isinstance(probs, ops.Tensor)) + self.assertTrue(isinstance(paths, ops.Tensor)) + self.assertTrue(isinstance(var, ops.Tensor)) + with self.test_session(): + variables.global_variables_initializer().run() + resources.initialize_resources(resources.shared_resources()).run() + self.assertEquals(probs.eval().shape, (4, 2)) + self.assertEquals(probs.eval().tolist(), expected_prediction) + def testTrainingConstructionClassificationSparse(self): input_data = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b7b26cfb1c..da4dd5a14c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -91,8 +91,11 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, if (!subgraph_node_ids.count(edge->src()->id()) && !edge->src()->IsSource() && !edge->IsControlEdge()) { incoming_edges->insert(edge); + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " Y, "; } else { - VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, "; + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " N, "; } } } @@ -106,10 +109,12 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, for (const tensorflow::Edge* edge : node->out_edges()) { if (!subgraph_node_ids.count(edge->dst()->id()) && !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " Y, "; outgoing_edges->insert(edge); } else { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " N, "; } } } @@ -181,29 +186,27 @@ struct ConvertGraphParams { static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_incoming_edges); + + std::set> unique_tensors; + // Add only unique input source nodes. 
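The FillSubGraphEdgeSets rewrite here routes the subgraph inputs and outputs through a std::set keyed by (node id, output port), so an outside tensor consumed by several nodes inside the segment becomes a single engine input. A rough Python sketch of that de-duplication, assuming edges are reduced to (src_node_id, src_output_port) pairs:

```python
def unique_boundary_tensors(edge_tensors):
  """edge_tensors: iterable of (src_node_id, src_output_port) pairs."""
  # a std::set keeps one copy of each tensor, in sorted order
  return sorted(set(edge_tensors))

# two in-segment consumers of node 3's output 0 yield a single engine input
assert unique_boundary_tensors([(3, 0), (3, 0), (5, 1)]) == [(3, 0), (5, 1)]
```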
If output of an outside node is shared + // between multiple nodes inside the engine, only one edge should be created for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()}); - } - auto output_name_to_index_map = BuildTensorNameMap(p->output_names); - std::set> subgraph_outputs_set; - // Collect outputs referenced from output_names - for (int node_id : p->subgraph_node_ids) { - tensorflow::Node* node = p->graph.FindNodeId(node_id); - if (output_name_to_index_map.count(node->name())) { - for (int index : output_name_to_index_map.at(node->name())) { - subgraph_outputs_set.insert({node_id, index}); - } - } + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } + p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), + unique_tensors.end()); GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_outgoing_edges); + unique_tensors.clear(); + // Similar to above, if multiple ouside nodes are sharing the output of an + // internal node only one output port should be created and shared between + // outputs for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()}); + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } - p->subgraph_outputs.reserve(subgraph_outputs_set.size()); + p->subgraph_outputs.reserve(unique_tensors.size()); p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - subgraph_outputs_set.begin(), - subgraph_outputs_set.end()); + unique_tensors.begin(), unique_tensors.end()); return tensorflow::Status::OK(); } @@ -225,7 +228,6 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) { for (auto in_edge : params->subgraph_incoming_edges) { // loop over incoming edges and // attach them to calib node - // tensorflow::Node* src_node = in_edge->src(); auto src_output = in_edge->src_output(); auto dst_node = in_edge->dst(); auto dst_input = in_edge->dst_input(); @@ -257,19 +259,24 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); } + std::set> unique_tensors; for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { std::pair old_src = {edge->src()->id(), edge->src_output()}; + if (unique_tensors.count(old_src)) continue; + unique_tensors.insert(old_src); int new_src_output = subgraph_edge_to_input_map.at(old_src); params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, new_src_output); + VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() + << " -> " << trt_node->name() << ":" << new_src_output; params->graph.RemoveEdge(edge); } - - VLOG(2) << "new wiring edges: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + if (VLOG_IS_ON(2)) { + VLOG(2) << "new edge count: " << trt_node->in_edges().size(); + for (const tensorflow::Edge* edge : trt_node->in_edges()) { + VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + } } - TF_RETURN_IF_ERROR(status); // Re-map outgoing edges to use the new TRT node instead of the orig subgraph @@ -283,6 +290,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { int new_src_output = subgraph_edge_to_output_map.at(old_src); TF_RETURN_IF_ERROR(params->graph.UpdateEdge( trt_node, 
new_src_output, edge->dst(), edge->dst_input())); + VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " + << edge->dst()->name() << ":" << edge->dst_input(); } // Remove the original subgraph for (int node_id : params->subgraph_node_ids) { @@ -317,9 +326,12 @@ tensorflow::Status ConvertCalibGraphToInferGraph( tensorflow::GraphConstructorOptions(), graph_def, &graph)); // get calib nodes std::vector calib_nodes; - for (auto node : graph.op_nodes()) { + std::vector topo_order; + tensorflow::GetPostOrder(graph, &topo_order); + for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { + auto node = *rit; if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node"; + VLOG(1) << "Found Calib Node " << node->name(); calib_nodes.push_back(node); } } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 96e0700862..4e4d295538 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -362,10 +362,11 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast( - const_cast(oweights->GetValues())), - ostrides); + Reorder2( + {k, c}, static_cast(iweights.GetValues()), + istrides, + static_cast(const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -1179,9 +1180,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + - " not supported at: " + - node_def.name()); + return tensorflow::errors::Unimplemented( + "binary op: " + node_def.op() + + " not supported at: " + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -2138,9 +2139,7 @@ void Converter::register_op_converters() { } } // namespace -tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { - return tensorflow::errors::Unimplemented("Not implemented yet"); -} + tensorflow::Status ConvertCalibrationNodeToEngineNode( tensorflow::Graph& graph, tensorflow::Node* c_node) { const auto ndef = c_node->def(); @@ -2164,9 +2163,23 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( for (auto n : graph.op_nodes()) { node_maps.insert({n->name(), n}); } + std::set subgraph_ids; + for (const auto internal_node : segment_nodes) { + subgraph_ids.insert(node_maps.at(internal_node)->id()); + } + if (VLOG_IS_ON(2)) { + string node_names = StrCat(c_node->name(), " segment nodes= "); + + for (const auto& node_name : segment_nodes) { + StrAppend(&node_names, node_name, ", "); + } + VLOG(2) << node_names; + } + VLOG(1) << "Output Nodes:"; std::vector out_types; std::vector out_edges; + for (auto& i : output_nodes) { auto node_port = tensorflow::str_util::Split(i, ":"); VLOG(1) << " " << i << " in graph " << node_maps.count(i); @@ -2186,18 +2199,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( out_types.push_back(out_node->output_type(0)); } for (auto out_edge : out_node->out_edges()) { + if (subgraph_ids.count(out_edge->dst()->id())) + continue; // skip internal edges; if (out_edge->src_output() == port) { out_edges.push_back(out_edge); - break; + VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" + << out_edge->src_output() << " -> " << out_edge->dst()->name() + << 
":" << out_edge->dst_input(); } } } else { LOG(WARNING) << " couldn't find output node " << out_node_name; } } - VLOG(1) << "Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); + if (VLOG_IS_ON(1)) { + VLOG(1) << c_node->name() << " Input Nodes:"; + for (auto& i : input_names) { + VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); + } } auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); auto resmgr = trt_rm->getManager("TRTCalibOps"); @@ -2231,14 +2250,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( calib_res->builder_ = nullptr; tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector income_edges; + income_edges.resize(c_node->num_inputs()); for (const auto in_edge : c_node->in_edges()) { auto src = in_edge->src(); int dest_port = in_edge->dst_input(); - income_edges.emplace_back(src->name(), in_edge->src_output(), - c_node->input_type(dest_port)); + VLOG(1) << "Incoming connection " << src->name() << ":" + << in_edge->src_output() << " -> " << c_node->name() << ":" + << dest_port; + income_edges.at(dest_port) = {src->name(), in_edge->src_output(), + c_node->input_type(dest_port)}; } tensorflow::gtl::ArraySlice input_list( income_edges); + if (VLOG_IS_ON(2)) { + for (const auto& inp : input_list) { + VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " + << tensorflow::DataTypeString(inp.data_type); + } + } op_builder.Input(input_list); tensorflow::NodeDef engine_node; const char* engine_plan_data = static_cast(engine_plan->data()); @@ -2255,13 +2284,26 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( } auto trt_engine_node = graph.AddNode(engine_node, &status); TF_RETURN_IF_ERROR(status); - for (size_t i = 0; i < out_edges.size(); i++) { - VLOG(1) << "Connecting trt_engine_node output " << i << " with " - << out_edges.at(i)->dst()->name() << " port " - << out_edges.at(i)->dst_input(); - TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i, - out_edges.at(i)->dst(), - out_edges.at(i)->dst_input())); + std::map port_map; + for (size_t t = 0; t < output_nodes.size(); t++) { + port_map.insert({output_nodes.at(t), t}); + } + for (auto& i : out_edges) { + string s(i->src()->name()); + if (i->src_output()) StrAppend(&s, ":", i->src_output()); + int out_port = port_map.at(s); + VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port + << " -> " << i->dst()->name() << ":" << i->dst_input(); + TF_RETURN_IF_ERROR( + graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); + } + for (const auto ed : trt_engine_node->in_edges()) { + VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); + } + for (const auto ed : trt_engine_node->out_edges()) { + VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); } VLOG(1) << "Segment nodes:"; for (auto& i : segment_nodes) { @@ -2332,6 +2374,7 @@ tensorflow::Status ConvertSubgraph( std::vector* output_names, std::vector* output_dtypes, const string& engine_name) { + std::set added_tensors; for (const std::pair& input : s.input_inds) { VLOG(2) << "parsing input. 
Node id= " << input.first; int node_id = input.first; @@ -2374,7 +2417,6 @@ tensorflow::Status ConvertSubgraph( auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); - input_dtypes->push_back(tf_dtype); nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); auto type_status = ConvertDType(tf_dtype, &dtype); @@ -2410,8 +2452,10 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) { input_tensor_name = StrCat(node_name, ":", output_idx); } - + if (added_tensors.count(input_tensor_name)) continue; + added_tensors.insert(input_tensor_name); input_names->push_back(input_tensor_name); + input_dtypes->push_back(tf_dtype); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); @@ -2435,6 +2479,7 @@ tensorflow::Status ConvertSubgraph( // Gather output metadata int trt_engine_op_output_idx = 0; + added_tensors.clear(); for (const std::pair& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; @@ -2451,6 +2496,8 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); VLOG(2) << "Output tensor name: " << tensor_name; + if (added_tensors.count(tensor_name)) continue; + added_tensors.insert(tensor_name); output_names->push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); if (!tensor_or_weights.is_tensor()) { diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index 2e472a2805..d879170b68 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -166,11 +166,21 @@ def StreamingFilesDataset(files, return remote_iterator.get_next() def MapFn(unused_input): - return functional_ops.remote_call( + if isinstance(source_dataset.output_types, dtypes.DType): + output_types = [source_dataset.output_types] + elif isinstance(source_dataset.output_types, (list, tuple)): + output_types = source_dataset.output_types + else: + raise ValueError('source dataset has invalid output types') + remote_calls = functional_ops.remote_call( args=[source_handle], - Tout=[dtypes.string], + Tout=output_types, f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) + if len(remote_calls) == 1: + return remote_calls[0] + else: + return remote_calls with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index 918cf0ed8e..b58d05eac5 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -26,6 +26,8 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import python_io from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -162,6 +164,30 @@ class DatasetsTest(test.TestCase): self.assertEqual(set(all_contents), set(retrieved_values)) + def testArbitraryReaderFuncFromDatasetGenerator(self): + + def my_generator(): + yield (1, [1] * 10) + + def 
gen_dataset(dummy): + return dataset_ops.Dataset.from_generator( + my_generator, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10]))) + + dataset = datasets.StreamingFilesDataset( + dataset_ops.Dataset.range(10), filetype=gen_dataset) + + iterator = dataset.make_initializable_iterator() + self._sess.run(iterator.initializer) + get_next = iterator.get_next() + + retrieved_values = self._sess.run(get_next) + + self.assertIsInstance(retrieved_values, (list, tuple)) + self.assertEqual(len(retrieved_values), 2) + self.assertEqual(retrieved_values[0], 1) + self.assertItemsEqual(retrieved_values[1], [1] * 10) + def testUnexpectedFiletypeString(self): with self.assertRaises(ValueError): datasets.StreamingFilesDataset( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c72ba2daff..a0cf59852b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -700,7 +700,9 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ + ":abi", ":lib_platform", + ":stacktrace", ], ) @@ -3090,6 +3092,8 @@ cc_library( # we now need at least "str_util". ":lib", ":lib_platform", + ":stacktrace_handler", + ":test_lite", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, @@ -3570,7 +3574,10 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], + srcs = [ + "common_runtime/mkl_cpu_allocator_test.cc", + "common_runtime/mkl_threadpool_device_test.cc", + ], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt index cbe76de415..985f09312f 100644 --- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt @@ -4,6 +4,10 @@ op { description: < 0`, limit of the split of the result. +END + } + summary: "Split elements of `source` based on `sep` into a `SparseTensor`." + description: <2<><>3"` and +sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty +string, consecutive whitespace are regarded as a single separator, and the +result will contain no empty strings at the startor end if the string has +leading or trailing whitespace. + +Note that the above mentioned behavior matches python's str.split. +END +} diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt new file mode 100644 index 0000000000..0e8576fb01 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "StringSplitV2" + visibility: HIDDEN +} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 8f2a419756..9cda17867b 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCAllocator::Extend(size_t rounded_bytes) { +bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. 
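Returning to the StreamingFilesDataset hunk further above: MapFn now normalizes source_dataset.output_types to a flat list before calling functional_ops.remote_call, and unwraps the result only when the dataset has a single component. A hedged sketch of that normalization in plain Python; the helper names are illustrative and the real code additionally checks that a bare value is a tf.DType:

```python
def normalize_output_types(output_types):
  # remote_call needs a flat list of dtypes; a bare dtype means one component
  if isinstance(output_types, (list, tuple)):
    return list(output_types)
  return [output_types]

def unwrap_remote_call(results):
  # preserve the old single-tensor return, pass multi-component results through
  return results[0] if len(results) == 1 else results

assert unwrap_remote_call(['t0']) == 't0'
assert unwrap_remote_call(['t0', 't1']) == ['t0', 't1']
```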
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = suballocator_->Alloc(32, bytes); + void* mem_addr = suballocator_->Alloc(alignment, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = suballocator_->Alloc(32, bytes); + mem_addr = suballocator_->Alloc(alignment, bytes); } } @@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } // Try to extend - if (Extend(rounded_bytes)) { + if (Extend(unused_alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index ba5a3eea3a..52aedb1e9c 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -305,7 +305,8 @@ class BFCAllocator : public VisitableAllocator { // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. - bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); + bool Extend(size_t alignment, size_t rounded_bytes) + EXCLUSIVE_LOCKS_REQUIRED(lock_); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 6e08e33f8e..486f0be698 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -105,9 +105,25 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { +#ifdef INTEL_MKL + // if MKL is used, it goes through various additional + // graph rewrite pass. In TF, everytime a graph pass + // happens, "constant" nodes are allocated + // and deallocated. Each allocation calls the + // (FindChunkPtr of BFCAllocator), + // which increments the value of AllocationId. + // Thus AllocationId becomes more than 3 and 4 if + // MKL is used. Now they are 9 and 10 for MKL. + EXPECT_EQ(19, cm->AllocationId(node, 0)); +#else EXPECT_EQ(21, cm->AllocationId(node, 0)); +#endif } else { +#ifdef INTEL_MKL + EXPECT_EQ(20, cm->AllocationId(node, 0)); +#else EXPECT_EQ(22, cm->AllocationId(node, 0)); +#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc new file mode 100644 index 0000000000..5d583a8360 --- /dev/null +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +#ifdef _OPENMP +TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { + SessionOptions options; + unsetenv("OMP_NUM_THREADS"); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + const int ht = port::NumHyperthreadsPerCore(); + EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); +} + +TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { + SessionOptions options; + setenv("OMP_NUM_THREADS", "314", 1); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + EXPECT_EQ(omp_get_max_threads(), 314); +} +#endif // _OPENMP + +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 21912236d0..a5d31b75c7 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,8 +16,10 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL +#ifdef _OPENMP #include -#endif +#endif // _OPENMP +#endif // INTEL_MKL #include #include "tensorflow/core/lib/core/threadpool.h" @@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - const int mkl_intra_op = omp_get_max_threads(); + int mkl_intra_op = 1; +#ifdef _OPENMP + mkl_intra_op = omp_get_max_threads(); +#endif // _OPENMP CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif +#endif // INTEL_MKL } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f7a07fe503..74a87215e1 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,7 +31,11 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL +#ifdef _OPENMP +#include +#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { +#ifdef INTEL_MKL +#ifdef _OPENMP + const char* user_omp_threads = getenv("OMP_NUM_THREADS"); + if (user_omp_threads == nullptr) { + // OMP_NUM_THREADS controls MKL's intra-op parallelization + // Default to available physical cores + const int mkl_intra_op = port::NumSchedulableCPUs(); + const int ht = port::NumHyperthreadsPerCore(); + omp_set_num_threads((mkl_intra_op + ht - 1) / ht); + } else { + uint64 user_val = 0; + if (strings::safe_strtou64(user_omp_threads, &user_val)) { + // Superflous but triggers OpenMP loading + omp_set_num_threads(user_val); + } + } +#endif // _OPENMP +#endif // INTEL_MKL +} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc index 1cea1b1462..770a0fcf14 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc @@ -147,7 +147,9 @@ MasterService::Stub::Stub( } MasterService::AsyncService::AsyncService() { - for (int i = 0; i < 10; ++i) { + int method_len = sizeof(grpcMasterService_method_names) / + sizeof(grpcMasterService_method_names[0]); + for (int i = 0; i < method_len; ++i) { AddMethod(new ::grpc::internal::RpcServiceMethod( grpcMasterService_method_names[i], ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc index 89f83f9f24..a8508d2d4f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -50,9 +51,14 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n, } for (int i = 0; i < n; ++i) { + string server_file = + strings::StrCat(testing::TensorFlowSrcRoot(), + "/core/distributed_runtime/rpc/grpc_testlib_server"); + if (!options.env->FileExists(server_file).ok()) { + return errors::Internal("Could not find grpc_testlib_server"); + } const std::vector argv( - {strings::StrCat(testing::TensorFlowSrcRoot(), - "/core/distributed_runtime/rpc/grpc_testlib_server"), + {server_file, /* see grpc_testlib_server.cc for flags */ tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i), strings::StrCat("--num_cpus=", num_cpus), diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2c87156dca..2bb4d32d57 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,13 +67,8 @@ struct AllocatorStats { // device memory. 
class Allocator { public: -#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; -#else - // Align to 32 byte boundary. - static constexpr size_t kAllocatorAlignment = 32; -#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 3d7920a6e2..4b56d807df 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" +#include #include #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index eb689ec1e6..10072724d2 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -5,7 +5,7 @@ option cc_enable_arenas = true; option java_outer_classname = "RemoteFusedGraphExecuteInfoProto"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; -//add go_package externally +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/types.proto"; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index b613effd18..80e168df97 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte +// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure -// its result is aligned if the caller intends to use those methods. -// In this test case, we simply make sure each slice is 32-byte -// aligned: sizeof(float) * 4 * 2 = 32. +// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires +// the caller to ensure its result is aligned if the caller intends +// to use those methods. In this test case, we simply make sure each +// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat().setConstant(i * 1.f); } // A simple slice along dim0. 
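The Slice_Basic comment above notes that with the now unconditional 64-byte kAllocatorAlignment each dim0 slice must span a multiple of 64 bytes, hence the inner dimension changes from 34 to 36. The arithmetic, spelled out as a quick check:

```python
# bytes covered by one dim0 slice of a float tensor shaped [10, 4, dim2]
def slice_bytes(dim1, dim2, elem_size=4):
  return dim1 * dim2 * elem_size

assert slice_bytes(4, 36) == 576 and 576 % 64 == 0  # new shape: 64-byte aligned
assert slice_bytes(4, 34) % 64 != 0                 # old shape was only 32-byte aligned
```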
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); auto tx = x.tensor(); auto ty = y.tensor(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); x.flat().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 72a13d4da7..b9667998d6 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized + // path. The unoptimized path is slow. Thus we dont rewrite the node + // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKl DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN " ; + << "for LRN "; return false; } @@ -3015,6 +3015,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector* ws_tensors, bool* are_ws_tensors_added); + // Helper function used by FixMklMetaDataEdges. 
Fixes the metadata edge + // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph + // 'g'. Returns true is fixup was done; otherwise, it returns false. + bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, + const Edge* e_data, const Edge* e_metadata); + + // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly + // connected? If not, then fix them. This is needed because a graph may have + // some input Mkl metadata edges incorrectly setup after node merge and + // rewrite passes. This could happen because GetReversePostOrder function may + // not provide topologically sorted order if a graph contains cycles. The + // function returns true if at least one Mkl metadata edge for node 'n' was + // fixed. Otherwise, it returns false. + // + // Example: + // + // X = MklConv2D(_, _, _) + // Y = MklConv2DWithBias(_, _, _, _, _, _) + // Z = MklAdd(X, Y, DummyMklTensor, Y:1) + // + // For a graph such as shown above, note that 3rd argument of MklAdd contains + // DummyMklTensor. Actually, it should be getting the Mkl metadata from + // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible + // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X + // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl + // metadata edges only - it does not rewrite nodes nor does it modify the Mkl + // data edges (1st and 2nd arguments of MklAdd). + bool FixMklMetaDataEdges(std::unique_ptr* g, Node* n); + // Functions specific to operators to copy attributes // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. @@ -4241,6 +4270,92 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } +/////////////////////////////////////////////////////////////////////////////// +// Post-rewrite Mkl metadata fixup pass +/////////////////////////////////////////////////////////////////////////////// +bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, + const Edge* e_data, const Edge* e_metadata) { + if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { + return false; + } + + Node* n_data = e_data->src(); + int n_data_op_slot = e_data->src_output(); + int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, + n_data->num_outputs()); + + // If the source of meta edge is a constant node (producing dummy Mkl metadata + // tensor), then we will need to fix. + if (IsConstant(e_metadata->src())) { + Node* e_metadata_dst = e_metadata->dst(); + int e_metadata_in_slot = e_metadata->dst_input(); + CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, + e_metadata_dst, e_metadata_in_slot)); + + (*g)->RemoveEdge(e_metadata); + return true; + } + + return false; +} + +bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr* g, + Node* n) { + bool result = false; + + // If graph node is not Mkl node, then return. + DataType T = DT_INVALID; + if (!GetNodeAttr(n->def(), "T", &T).ok() || + !mkl_op_registry::IsMklOp(n->type_string(), T)) { + return result; + } + + // If it is Mkl node, then check if the input edges to this node that carry + // Mkl metadata are linked up correctly with the source node. + + // For Mkl nodes, we generate twice the number of input tensors (n for Mkl + // data tensors + n for Mkl metadata tensors). We need to check for correct + // connection of n metadata tensors only. 
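The comment block above spells out the invariant the fixup restores: every MKL data input is paired with a metadata input, and after the merge and rewrite passes a metadata input can still point at a dummy constant even though its data producer is an MKL op with a real metadata output. A deliberately simplified sketch of that repair over a toy edge list (plain Python, not the TensorFlow graph API):

```python
def fix_metadata_inputs(data_srcs, meta_srcs, meta_output_of, is_mkl_op):
  """data_srcs[i] / meta_srcs[i]: producer of the i-th data / metadata input.
  meta_output_of: maps an MKL producer to its own metadata output."""
  fixed = False
  for i, producer in enumerate(data_srcs):
    if is_mkl_op(producer) and meta_srcs[i] == 'dummy_const':
      meta_srcs[i] = meta_output_of[producer]  # rewire to the real Mkl metadata
      fixed = True
  return fixed

meta = ['dummy_const']
assert fix_metadata_inputs(['X:0'], meta, {'X:0': 'X:2'}, lambda p: p == 'X:0')
assert meta == ['X:2']  # matches the MklAdd example in the comment above
```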
+ int num_data_inputs = n->num_inputs() / 2; + for (int idx = 0; idx < num_data_inputs; idx++) { + // Get the edge connecting input slot with index (idx). + const Edge* e = nullptr; + TF_CHECK_OK(n->input_edge(idx, &e)); + + // If e is control edge, then skip. + if (e->IsControlEdge()) { + continue; + } + + // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl + // node, then we don't need to do anything. + Node* e_src = e->src(); + if (GetNodeAttr(e_src->def(), "T", &T).ok() && + mkl_op_registry::IsMklOp(e_src->type_string(), T)) { + // Source node for edge 'e' is Mkl node. + // Destination node and destination input slot of e is node 'n' and 'idx' + // resp. + CHECK_EQ(e->dst(), n); + CHECK_EQ(e->dst_input(), idx); + + // Let's get edge that carries Mkl metadata corresponding to Mkl data edge + // 'e'. For that, let's first get the input slot of 'n' where the meta + // edge will feed the value. + int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(), + n->num_inputs()); + const Edge* e_meta = nullptr; + TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta)); + + // Let's check if we need to fix this meta edge. + if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) { + result = true; + } + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// // Run function for the pass /////////////////////////////////////////////////////////////////////////////// @@ -4307,6 +4422,25 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr* g) { DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g); + order.clear(); + GetReversePostOrder(**g, &order); // This will give us topological sort. + for (Node* n : order) { + // If node is not an op or it cannot run on CPU device, then skip. 
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { + continue; + } + if (FixMklMetaDataEdges(g, n)) { + string node_name = n->name(); + string op_name = n->type_string(); + + VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " + << node_name << " with op " << op_name; + result = true; + } + } + DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", + &**g); + return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 029cdcf94a..7645b4a7f0 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,6 +3518,37 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } +///////////////////////////////////////////////////////////////////// +// Post-rewrite fixup pass test + +TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'M' op: '_MklInput'}" + "node { name: 'N' op: '_MklInput'}" + "node { name: 'C' op: '_MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['A', 'B', 'M', 'N']}" + "node { name: 'D' op: 'Const' " + " attr { key: 'dtype' value { type: DT_UINT8 } }" + " attr { key: 'value' value { " + " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " + " int_val: 0 } } } }" + "node { name: 'E' op: '_MklAdd'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'A', 'D', 'D']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" + "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" + "D->E:3;M->C:2;N->C:3"); +} + ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index 352f08fede..31b19cfcfd 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -546,7 +546,7 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory_before)); EXPECT_EQ(device_peak_memory_before.size(), 1); // There might be a bit memory used before session's running anything. - EXPECT_LT(device_peak_memory_before.begin()->second, 200); + EXPECT_LT(device_peak_memory_before.begin()->second, 400); RunMetadata metadata; TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata)); @@ -567,8 +567,8 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { // Check memory used by resources are released after cluster destruction. 
EXPECT_EQ(device_peak_memory_before.size(), 1); EXPECT_EQ(device_peak_memory_after.size(), 1); - EXPECT_LT(device_peak_memory_before.begin()->second, 200); - EXPECT_LT(device_peak_memory_after.begin()->second, 200); + EXPECT_LT(device_peak_memory_before.begin()->second, 400); + EXPECT_LT(device_peak_memory_after.begin()->second, 400); } TEST_F(SingleMachineTest, PeakMemory) { @@ -597,7 +597,7 @@ TEST_F(SingleMachineTest, PeakMemory) { device_peak_memory.end()); cpu_memory = device_peak_memory["/job:localhost/replica:0/task:0/device:CPU:0"]; - EXPECT_LT(cpu_memory, 100); + EXPECT_LT(cpu_memory, 200); } TEST_F(SingleMachineTest, PeakMemoryStatsNotEnabled) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 6749a7c571..0c02876ac5 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,7 +610,6 @@ class SymbolicShapeRefiner { } }; - // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 1b18087cdf..8ca726df0b 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,6 +679,7 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -780,7 +781,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 4dde7ed1b4..03e36a7b9c 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -200,8 +201,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() - << std::endl; + VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index 66c4aff3e3..a7757d1361 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,6 +73,7 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: + case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -129,6 +130,7 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); + ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 14d889e8e3..49b90e855b 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,52 +33,41 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); + OP_REQUIRES(ctx, (in0.shape() == in1.shape() || + TensorShapeUtils::IsScalar(in1.shape())) && + (in0.shape() == in2.shape() || + TensorShapeUtils::IsScalar(in2.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); + + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); + auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, - (in0.shape() == in2.shape() && - TensorShapeUtils::IsScalar(in1.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 9a3b2303a3..17a85d9773 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,6 +57,7 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index e6fefe643b..5cd8e04927 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,6 +37,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 39b6924d74..4563fc6353 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 7e5a9e1ec5..4e53291b7f 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,6 +228,8 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -239,6 +241,8 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) +TF_CALL_int32(REGISTER_GATHER_ND_GPU); +TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index b03efc684f..da8d2e9e3c 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,6 +119,8 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index ef332ebee3..094504d6b9 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,6 +153,7 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) +TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 5eeb23d810..31d1b949ef 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,6 +14,7 @@ limitations under the License. #include #include +#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -590,8 +591,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector input_shapes(N); - GetMklShapeList(context, "values", &input_shapes); + std::vector mkl_input_shapes(N); + GetMklShapeList(context, "values", &mkl_input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -610,19 +611,14 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() - ? input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() + ? mkl_input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : input_shapes) { - if (s == expected_shape) { - ++i; - continue; - } - + for (auto& s : mkl_input_shapes) { TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -665,21 +661,14 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - TensorShapeList tf_input_shapes; - i = 0; - for (auto& s : input_shapes) { - TensorShape s_shape = - s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); - tf_input_shapes.push_back(s_shape); - ++i; - } - CallEigenVersion(context, input_tensors, tf_input_shapes); + CallEigenVersion(context, input_tensors, mkl_input_shapes); return; } memory::dims dst_dims; + if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -689,26 +678,61 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k = 0; k < N; k++) { - bool is_mkl_tensor = input_shapes[k].IsMklTensor(); - memory::dims src_dims; - - // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) - ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) - : TFShapeToMklDnnDims(input_tensors[k].shape()); - - dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = - is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here - // (NHWC or NCHW). We just need to ensure that output - // of Concat uses same data format as input. - memory::desc(src_dims, MklDnnType(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); + + bool isMklReorderNeeded = false; + memory::format mkl_common_format = memory::format::any; + if (are_all_mkl_inputs) { + mkl_common_format = + FindMklCommonFormat(mkl_input_shapes, concat_dim, + &isMklReorderNeeded, &dst_concat_dim_size); + + if (!isMklReorderNeeded) { + // All MKL tensors have a same format. Reorder is not needed. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } + } else { + // MKL tensors have different formats. + // Reorder them to most common format. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_dims = TFShapeToMklDnnDims( + mkl_input_shapes[k].GetTfShape()); + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + + if (src_md.data.format != mkl_common_format) + src_md = memory::desc(src_dims, MklDnnType(), + mkl_common_format); + + srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); + } + } + } else { // All TF inputs + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); + dst_concat_dim_size += src_dims[concat_dim]; + + // It does not matter what data format to be used (NHWC versus NCHW). + // We just need to ensure that output uses same data format as inputs. + auto src_md = + memory::desc(src_dims, MklDnnType(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } } dst_dims[concat_dim] = dst_concat_dim_size; @@ -718,25 +742,33 @@ class MklConcatOp : public OpKernel { if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). 
- auto orig_tf_format = input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // We will set the output in the same format as input to avoid layout - // conversions. - // Currently we are setting dst format same as input format. - // See if we can make this choice in a better way. + // Set the output format same as the most common format of inputs + // to avoid layout conversions. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType(), - (memory::format)input_shapes[0].GetMklLayout().data.format); + dst_dims_in_nchw, MklDnnType(), mkl_common_format); } else { - // Again, format does not matter here. We just need to make it same as - // input format. + // All inputs are TF tensors. + // Set the output format same as input format (nchw). dst_md = memory::desc(dst_dims, MklDnnType(), memory::format::nchw); } std::vector inputs; - for (int k = 0; k < input_tensors.size(); k++) - inputs.push_back(srcs[k].GetOpMem()); + std::vector net; + if (isMklReorderNeeded) { + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); + } + } + } + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + inputs.push_back(srcs[k].GetOpMem()); + } + } // If all inputs are in MKL format, then meaning of concat_dim needs to // change. Value of concat_dim is tied to input Tensorflow data format @@ -745,7 +777,8 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
- if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) + concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -758,7 +791,7 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - input_shapes[0].GetTfDataFormat()); + mkl_input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -773,7 +806,6 @@ class MklConcatOp : public OpKernel { dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); - std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -787,15 +819,27 @@ class MklConcatOp : public OpKernel { } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const TensorShapeList& input_shapes) { - CHECK_EQ(values.size(), input_shapes.size()); + const MklDnnShapeList& mkl_input_shapes) { + CHECK_EQ(values.size(), mkl_input_shapes.size()); std::vector converted_values; - for (int i = 0; i < input_shapes.size(); i++) - converted_values.push_back(values[i]); + TensorShapeList tf_input_shapes; + for (int i = 0; i < mkl_input_shapes.size(); i++) { + if (mkl_input_shapes[i].IsMklTensor()) { + // do conversion from MKL to TF + Tensor tmp_tensor = + ConvertMklToTF(context, values[i], mkl_input_shapes[i]); + converted_values.push_back(tmp_tensor); + tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); + } else { + // no conversion since it is TF tensor already + converted_values.push_back(values[i]); + tf_input_shapes.push_back(values[i].shape()); + } + } // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, input_shapes); + eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -812,6 +856,55 @@ class MklConcatOp : public OpKernel { output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); } + + // This method finds the most commom format accross all MKL inputs + // Inputs: + // 1. input_shapes: shapes of input (MKL) tensors. + // 2. concat_dim: concat dimension. + // Outputs: + // 1. is_reorder_needed is set to true if inputs have difference formats + // It is set to false otherwise. + // 2. concat_dim_size is the size of concat_dim. + // Return: + // return the common MKL format. + memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, + int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { + *is_reorder_needed = false; + *concat_dim_size = 0; + std::unordered_map occurrence_map; + if (input_shapes.size() == 0) + return memory::format::any; + + // Compute ocurrences of each format of all inputs. + for (int k=0; k ( + input_shapes[k].GetMklLayout().data.format); + occurrence_map[fmt] += 1; + } + + if (occurrence_map.size() == 1) { + // this means that all inputs have a same format + // return it with is_reorder_needed set false. + return static_cast( + input_shapes[0].GetMklLayout().data.format); + } + + // Input tensors have different formats. Thus, reorder is needed. + // We pick up the most common format to minimize the total + // number of input reorder. 
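The comment block above states FindMklCommonFormat's contract; combined with the rewritten input-preparation loop earlier in this hunk, the policy is: keep MKL inputs as-is when they already share one format, reorder them to the most common format when they do not, and fall back to a plain NCHW description when every input is an ordinary TF tensor. A hedged Python-style sketch of that selection rule follows (names and format tokens are illustrative, not the actual MKL-DNN API):

```python
from collections import Counter

def plan_concat_inputs(input_formats, all_mkl_inputs):
    """Sketch of the format-selection policy described above.

    input_formats: per-input layout tags (hypothetical string tokens here).
    Returns (common_format, reorder_needed).
    """
    if not all_mkl_inputs:
        # All TF tensors: the format does not matter, use NCHW, no reorder.
        return 'nchw', False
    counts = Counter(input_formats)
    if len(counts) == 1:
        # Every MKL input already shares one format: no reorder needed.
        return input_formats[0], False
    # Mixed formats: reorder everything to the most frequent one to
    # minimize the total number of input reorders.
    common_format, _ = counts.most_common(1)[0]
    return common_format, True

print(plan_concat_inputs(['blocked8', 'blocked8'], True))      # ('blocked8', False)
print(plan_concat_inputs(['blocked8', 'nchw', 'nchw'], True))  # ('nchw', True)
```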
+ memory::format commonest_format = memory::format::any; + int max_occurrence = 0; + *is_reorder_needed = true; + for (auto item : occurrence_map) { + if (item.second > max_occurrence) { + commonest_format = static_cast(item.first); + max_occurrence = item.second; + } + } + return commonest_format; + } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index c1da0ded1d..f857be6c32 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,6 +18,7 @@ limitations under the License. // bias. #ifdef INTEL_MKL +#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -264,4 +265,5 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ +#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 279167aba2..c0dfed7d7d 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,13 +199,15 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = + if (input_tensor.NumElements() != 0) { + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); + dnn_data_input->SetUsrMem(input_md, &input_tensor); + } this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 43c5b29509..e1fc2ea128 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,6 +292,7 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); +TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -306,6 +307,8 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -576,6 +579,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd. 
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index a3c21edc15..08b657f4c3 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,6 +170,7 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index bb0129fa6f..634f9ba887 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,8 +216,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); - ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); + MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); + + // The elements of the third parameter to ExecOp must be multiples of + // Allocator::kAllocatorAlignment in size. If they are not, the backing + // tensor allocated by PrepOp will have too many elements and reshaping + // will fail. + ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index 7796bf3587..d65692a552 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,6 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ + +// This file requires the following include because it uses CudaAtomicMax: +// #include "tensorflow/core/util/cuda_kernel_helper.h" + +// Unfortunately we can't add the #include, since it breaks compilation for +// non-GPU targets. This only breaks in clang, because it's more strict for +// template code and CudaAtomicMax is used in template context. + // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -130,4 +138,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index a1f9667b78..866c5dcd52 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is a an overview of the SparseMatMul code. Note that we assume that the +// Here is an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 4c2b312c34..26ab72f12e 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
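The ScopedAllocatorConcatOpTest change above notes that each field passed to ExecOp must be a multiple of Allocator::kAllocatorAlignment bytes, which is why the {2, 2} double tensors were replaced with {2, 4}. A quick hedged check of that arithmetic (64 bytes is an assumed value for the alignment constant, not taken from this patch):

```python
# Hedged arithmetic only; 64 is an assumed kAllocatorAlignment and 8 bytes
# is sizeof(double). This is not TensorFlow code.
ASSUMED_ALIGNMENT = 64

def field_is_aligned(shape, bytes_per_element=8):
    num_bytes = bytes_per_element
    for dim in shape:
        num_bytes *= dim
    return num_bytes % ASSUMED_ALIGNMENT == 0

print(field_is_aligned((2, 2)))  # False: 32 bytes, not a multiple of 64
print(field_is_aligned((2, 4)))  # True: 64 bytes
```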
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -43,6 +44,63 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } +std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { + // This SplitV2 method matches the behavior of python's str.split: + // If sep is given, consecutive delimiters are not grouped together + // and are deemed to delimit empty strings (for example, '1,,2'.split(',') + // returns ['1', '', '2']). The sep argument may consist of multiple + // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). + // Splitting an empty string with a specified separator returns ['']. + // + // If sep is not specified or is None, a different splitting algorithm is + // applied: runs of consecutive whitespace are regarded as a single + // separator, and the result will contain no empty strings at the start or + // end if the string has leading or trailing whitespace. Consequently, + // splitting an empty string or a string consisting of just whitespace + // with a None separator returns []. + + std::vector result; + + StringPiece text(str); + if (maxsplit == 0) { + result.emplace_back(std::string(text)); + return result; + } + + if (sep.empty()) { + StringPiece token; + // Remove leading whitespaces. + str_util::RemoveLeadingWhitespace(&text); + int split = 0; + while (str_util::ConsumeNonWhitespace(&text, &token)) { + result.emplace_back(std::string(token)); + str_util::RemoveLeadingWhitespace(&text); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + } + return result; + } + auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + int split = 0; + while (p != text.end()) { + StringPiece token = text.substr(0, p - text.begin()); + result.emplace_back(std::string(token)); + text.remove_prefix(token.size()); + text.remove_prefix(sep.size()); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + } + result.emplace_back(std::string(text)); + return result; +} + } // namespace class StringSplitOp : public OpKernel { @@ -122,6 +180,78 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; +class StringSplitV2Op : public OpKernel { + public: + explicit StringSplitV2Op(OpKernelConstruction* context) + : OpKernel(context), maxsplit_(-1) { + OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), + errors::InvalidArgument("input must be a vector, got shape: ", + input_tensor->shape().DebugString())); + + const auto input_vec = input_tensor->vec(); + const int64 batch_size = input_vec.dimension(0); + + const Tensor* sep_tensor; + OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), + errors::InvalidArgument("sep must be a scalar, got shape: ", + sep_tensor->shape().DebugString())); + const auto sep_vec = sep_tensor->flat(); + StringPiece sep(sep_vec(0)); + std::vector tokens; + // Guess that we'll be unpacking a 
handful of tokens per example. + static constexpr int kReserveSize = 4; + tokens.reserve(batch_size * kReserveSize); + + int64 output_size = 0; + int64 max_num_entries = 0; + std::vector num_indices(batch_size); + for (int64 i = 0; i < batch_size; ++i) { + std::vector parts = SplitV2(input_vec(i), sep, maxsplit_); + int64 n_entries = parts.size(); + num_indices[i] = n_entries; + output_size += n_entries; + max_num_entries = std::max(max_num_entries, n_entries); + tokens.insert(tokens.end(), parts.begin(), parts.end()); + } + + Tensor* sp_indices_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), + &sp_indices_t)); + Tensor* sp_tokens_t; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); + Tensor* sp_shape_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); + + auto sp_indices = sp_indices_t->matrix(); + auto sp_tokens = sp_tokens_t->vec(); + auto sp_shape = sp_shape_t->vec(); + sp_shape(0) = batch_size; + sp_shape(1) = max_num_entries; + size_t c = 0; + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_indices[i]; ++j) { + sp_indices(c, 0) = i; + sp_indices(c, 1) = j; + sp_tokens(c) = tokens[c]; + ++c; + } + } + } + + private: + int maxsplit_; +}; + REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); +REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), + StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e4d100b04..6e589c8d1c 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,12 +145,15 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes. + // Validate true_classes, must be a matrix. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); + // Validate sampled_candidates, must be a vector. + ShapeHandle sampled_candidates; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 15e0ca8af9..9dca5f53ce 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,7 +218,17 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use index from the end to retrieve the Input shapes, + // so that to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars. 
+ shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -231,7 +241,17 @@ REGISTER_OP("MapAndBatchDatasetV2") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use index from the end to retrieve the Input shapes, + // so that to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. + shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index d949e70c66..87f4991134 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,7 +454,9 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); + // The rank of the input image (rank = 4) has already been restricted + // above, and the output is of the same shape as the input. 
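For reference, the SplitV2 helper added to string_split_op.cc earlier in this patch is documented to mirror Python's str.split. A short illustration of those semantics using the built-in (plain Python, not the new op):

```python
# Explicit separator: consecutive delimiters delimit empty strings, and the
# separator may be multi-character.
print('1,,2'.split(','))        # ['1', '', '2']
print('1<>2<>3'.split('<>'))    # ['1', '2', '3']
print(''.split(','))            # ['']

# No separator: runs of whitespace collapse, leading/trailing whitespace adds
# no empty strings, and an empty or all-whitespace input yields [].
print('  1  2   3  '.split())   # ['1', '2', '3']
print('   '.split())            # []

# maxsplit bounds the number of splits; the remainder stays in the last item.
print('a,b,c'.split(',', 1))    # ['a', 'b,c']
```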
+ return shape_inference::UnchangedShape(c); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 1740fa152c..b3487122e2 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: realnumbertype") + .Attr("T: numbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fc60e807b9..41efa49ce3 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,6 +1453,7 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 1d5c743a56..4423062362 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int32, int64, complex64, float, double, bool, int8}") + .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,6 +134,24 @@ REGISTER_OP("StringSplit") return Status::OK(); }); +REGISTER_OP("StringSplitV2") + .Input("input: string") + .Input("sep: string") + .Output("indices: int64") + .Output("values: string") + .Output("shape: int64") + .Attr("maxsplit: int = -1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + + c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); + c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); + c->set_output(2, c->Vector(2)); + return Status::OK(); + }); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index 99de364042..e9da3d8e32 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,5 +344,28 @@ int CPUModelNum() { #endif } +int CPUIDNumSMT() { +#ifdef PLATFORM_IS_X86 + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) + // Section: Detecting Hardware Multi-threads Support and Topology + // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures + // Other cases not supported + uint32 eax, ebx, ecx, edx; + // Check if system supports Leaf 11 + GETCPUID(eax, ebx, ecx, edx, 0, 0); + if (eax >= 11) { + // 1) Leaf 11 available? 
CPUID.(EAX=11, ECX=0):EBX != 0 + // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, + // ECX=0):ECX[15:8] is 1 + GETCPUID(eax, ebx, ecx, edx, 11, 0); + if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { + return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width + } + } +#endif // PLATFORM_IS_X86 + return 0; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index b5be7e8b54..175c9ae8b1 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,6 +35,10 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); +// Returns an estimate of the number of hyperthreads per physical core +// on the CPU +int NumHyperthreadsPerCore(); + // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -107,6 +111,9 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); +// Returns num of hyperthreads per physical core +int CPUIDNumSMT(); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index ae81f9b5b3..a319ccbdbe 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,6 +71,8 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], + # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 + # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 72c12318ca..ff4b4436bb 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,18 +115,17 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home == nullptr) { - status_ = errors::FailedPrecondition( - "Environment variable HADOOP_HDFS_HOME not set"); - return; - } - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (!status_.ok()) { - // try load libhdfs.so using dynamic loader's search path in case - // libhdfs.so is installed in non-standard location - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); + if (hdfs_home != nullptr) { + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (status_.ok()) { + return; + } } + + // Try to load the library dynamically in case it has been installed + // to a in non-standard location. + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 8e316472fe..708f32ba80 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,6 +74,11 @@ int NumSchedulableCPUs() { return kDefaultCores; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ? 
ht_per_core : 1; +} + void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 522a9d84fd..cb1fd09dbb 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 8 +#define TF_MINOR_VERSION 9 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-rc0" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index dffc965b14..90b6533690 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,6 +42,7 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" +#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -712,15 +713,48 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else +using mkldnn::stream; +template class MklDnnData; + template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - TensorShape output_shape; - - TF_CHECK_OK( - Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); - + try { + if (!mkl_shape.IsMklTensor()) + return mkl_tensor; // return input since it is already TF tensor + + TensorShape output_shape = mkl_shape.GetTfShape();; + + // Allocate output tensor. + context->allocate_temp(DataTypeToEnum::v(), + output_shape, &output_tensor); + + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData input(&cpu_engine); + + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape.GetMklLayout(); + auto output_tf_md = mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); + input.SetUsrMem(input_mkl_md, &mkl_tensor); + + // reorder + if (input.IsReorderNeeded(output_tf_pd)) { + std::vector net; + CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + } else { + // If not, just forward input tensor to output tensor. 
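Returning to the CPU-topology helper added above: CPUIDNumSMT trusts CPUID leaf 11 only when EBX is non-zero and the level type in ECX[15:8] is 1 (SMT), then reports 2^SMT_Mask_Width threads per core, and NumHyperthreadsPerCore falls back to 1 otherwise. A hedged restatement of that bit arithmetic in Python (register values are hypothetical inputs):

```python
# Sketch of the leaf-11 decoding described above; eax/ebx/ecx stand in for the
# raw CPUID(EAX=11, ECX=0) register values.
def smt_count_from_leaf11(eax, ebx, ecx):
    if ebx != 0 and ((ecx & 0xFF00) >> 8) == 1:   # level type 1 == SMT
        return 1 << (eax & 0x1F)                  # 2 ** SMT_Mask_Width
    return 0

def hyperthreads_per_core(eax, ebx, ecx):
    count = smt_count_from_leaf11(eax, ebx, ecx)
    return count if count > 0 else 1              # same fallback as the patch

print(hyperthreads_per_core(0x1, 0x2, 0x100))     # 2 threads per core
```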
+ CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); + } + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + LOG(FATAL) << "Operation received an exception: " << error_msg; + } return output_tensor; } #endif @@ -1843,7 +1877,7 @@ class FactoryKeyCreator { template void AddAsKey(const T data) { auto buffer = reinterpret_cast(&data); - Append(absl::string_view(buffer, sizeof(T))); + Append(StringPiece(buffer, sizeof(T))); } std::string GetKey() { @@ -1854,8 +1888,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(absl::string_view s) { - key_.append(string(s)); + void Append(StringPiece s) { + key_.append(s.ToString()); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index d92f5775fa..0b07d413da 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,17 +1,38 @@ # User Groups -TensorFlow has communities around the world. +TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) ## Asia -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow China community](https://www.tensorflowers.cn) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) +* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) +* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) +* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) +* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) +* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) +* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) +* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) + +## America + +* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) + + +## Oceania +* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) + + +## Africa + +* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index f08ac74425..bbb25e20c6 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) diff --git 
a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 55579d52fb..232d2f1547 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. -The easiest way to get started with TensorFlow is using Eager Execution. +The easiest way to get started with TensorFlow is by using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. See the diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 1abd840ab3..2901848745 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 52a2a3f8a6..55bc0f64e7 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 1256fb99c4..637231da12 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.8.0 + 1.9.0-rc0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.8.0 + 1.9.0-rc0 @@ -124,12 +124,12 @@ instead: org.tensorflow libtensorflow - 1.8.0 + 1.9.0-rc0 org.tensorflow libtensorflow_jni_gpu - 1.8.0 + 1.9.0-rc0 ``` @@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. 
Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip). 3. Extract this .zip file. - +__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. ### Validate the installation @@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.8.0.jar HelloTF.java
+
javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
### Running @@ -241,11 +241,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 0ed8160027..c8d706cf3c 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,9 +339,7 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -Prior to installing TensorFlow with GPU support, ensure that your system meets all -[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container -with NVidia GPU support, enter a command of the following format: +To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
@@ -438,7 +436,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
## Validate your installation @@ -517,7 +515,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -684,14 +682,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -703,14 +701,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -722,14 +720,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -741,14 +739,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 29a867a9e3..9d01271c5a 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index 5ba522b436..dc6c1e36fc 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.8.0 on Linux: +for TensorFlow 1.9.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
 
## Validate your installation @@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: - * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux} - * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS} - * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows} + * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux} + * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} + * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows} Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we @@ -433,6 +433,8 @@ Stack Overflow and specify the `tensorflow` tag. **Linux** + + @@ -456,6 +458,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | 7 | 9
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
+ @@ -472,6 +475,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.11.0 | N/A | N/A
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
+ + diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index cf0db59021..efef5dd0da 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md @@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program: 1. Include the jcenter AAR which contains it, as in this - [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65) + [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65) 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/). diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 8b22c04d87..2b84dbb973 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md @@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this: - bazel build tensorflow/tools:freeze_graph - bazel-bin/tensorflow/tools/freeze_graph \ + bazel build tensorflow/python/tools:freeze_graph + bazel-bin/tensorflow/python/tools/freeze_graph \ --input_graph=/tmp/model/my_graph.pb \ --input_checkpoint=/tmp/model/model.ckpt-1000 \ --output_graph=/tmp/frozen_graph.pb \ diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index 2fea02d861..c97f74139c 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md @@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.8.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.7.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
- +
Quantized | Float
0 | -10.0
255 | 30.0
128 | 10.0
255 | 30.0
Table 2: Example quantized value range diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index c4aae1d9d6..b13b47184d 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -21,18 +21,17 @@ Note: TensorFlow also includes a deprecated `Estimator` class at Estimators provide the following benefits: -* You can run Estimators-based models on a local host or on a +* You can run Estimator-based models on a local host or on a distributed multi-server environment without changing your model. - Furthermore, you can run Estimators-based models on CPUs, GPUs, + Furthermore, you can run Estimator-based models on CPUs, GPUs, or TPUs without recoding your model. * Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code, +* You can develop a state of the art model with high-level intuitive code. In short, it is generally much easier to create models with Estimators than with the low-level TensorFlow APIs. -* Estimators are themselves built on tf.layers, which +* Estimators are themselves built on @{tf.layers}, which simplifies customization. -* Estimators build the graph for you. In other words, you don't have to - build the graph. +* Estimators build the graph for you. * Estimators provide a safe distributed training loop that controls how and when to: * build the graph @@ -57,7 +56,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models -through dense, feed-forward neural networks. +based on dense, feed-forward neural networks. ### Structure of a pre-made Estimators program @@ -79,7 +78,7 @@ of the following four steps: an input function: def input_fn(dataset): - ... # manipulate dataset, extracting feature names and the label + ... # manipulate dataset, extracting the feature dict and the label return feature_dict, label (See @{$programmers_guide/datasets} for full details.) @@ -96,13 +95,13 @@ of the following four steps: population = tf.feature_column.numeric_column('population') crime_rate = tf.feature_column.numeric_column('crime_rate') median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn='lambda x: x - global_education_mean') + normalizer_fn=lambda x: x - global_education_mean) 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`: # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.Estimator.LinearClassifier( + estimator = tf.estimator.LinearClassifier( feature_columns=[population, crime_rate, median_education], ) diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 845194fe0e..90f5c53a17 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md @@ -528,10 +528,10 @@ suggested by the following snippet: categorical_column = ... # Create any categorical column # Represent the categorical column as an embedding column. -# This means creating a one-hot vector with one element for each category. 
+# This means creating an embedding vector lookup table with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=dimension_of_embedding_vector) + dimension=embedding_dimensions) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 03e60972aa..86f5204ec3 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,7 +21,8 @@ from __future__ import division from __future__ import print_function import os -import urllib + +from six.moves.urllib.request import urlretrieve import tensorflow as tf @@ -38,9 +39,7 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - raw = urllib.urlopen(download_url).read() - with open(file_name, 'w') as f: - f.write(raw) + urlretrieve(download_url, file_name) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index debd95fc62..9b171f66ec 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,9 +376,6 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations - op_class.add_annotation( - Annotation::Create("Generated", "javax.annotation") - .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -415,8 +412,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, - &dependencies, &op_javadoc); + writer.Write(kLicense) + .EndLine() + .Write("// This class has been generated, DO NOT EDIT!") + .EndLine() + .EndLine() + .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 181fd4c5e3..941ab2699c 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,6 +96,7 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } + Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index b2e6c60021..bd97b181ff 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
-  def grad_fn(*args):
+  def grad_fn(*args, **kwds):
     """Computes the gradient of the wrapped function."""
     this_tape = tape.push_new_tape()
     try:
-      end_node = f(*args)
+      end_node = f(*args, **kwds)
       if end_node is None:
         raise ValueError("Cannot differentiate a function that returns None; "
                          "did you forget to return a value from {}?".format(
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 9cd17e0407..20522098b0 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -978,7 +978,10 @@ py_test(
     size = "large",
     srcs = ["keras_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_windows",
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index 7cdf840c97..b18212cfcd 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result):
   return best_eval_result[default_key] > current_eval_result[default_key]
 
 
-def _verify_compre_fn_args(compare_fn):
+def _verify_compare_fn_args(compare_fn):
   """Verifies compare_fn arguments."""
   args = set(util.fn_args(compare_fn))
   if 'best_eval_result' not in args:
@@ -265,7 +265,7 @@ class BestExporter(Exporter):
     self._compare_fn = compare_fn
     if self._compare_fn is None:
       raise ValueError('`compare_fn` must not be None.')
-    _verify_compre_fn_args(self._compare_fn)
+    _verify_compare_fn_args(self._compare_fn)
 
     self._saved_model_exporter = _SavedModelExporter(
         name, serving_input_receiver_fn, assets_extra, as_text)
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index 035c7c148c..a6cefdece2 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -136,11 +136,13 @@ def numpy_input_fn(x,
       values in `x` have same shape).
     ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
     ValueError: if x or y is an empty dict.
-    TypeError: `x` is not a dict or array, or if `shuffle` is not bool.
+    TypeError: `x` is not a dict or array.
+    ValueError: if 'shuffle' is not provided or a bool.
   """
   if not isinstance(shuffle, bool):
-    raise TypeError('shuffle must be explicitly set as boolean; '
-                    'got {}'.format(shuffle))
+    raise ValueError('shuffle must be provided and explicitly set as boolean '
+                     '(it is recommended to set it as True for training); '
+                     'got {}'.format(shuffle))
 
   def input_fn():
     """Numpy input function."""
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 92d057e25d..81b201cc5c 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -286,8 +286,9 @@ class NumpyIoTest(test.TestCase):
     x = np.arange(32, 36)
     y = np.arange(4)
     with self.test_session():
-      with self.assertRaisesRegexp(TypeError,
-                                   'shuffle must be explicitly set as boolean'):
+      with self.assertRaisesRegexp(ValueError,
+                                   'shuffle must be provided and explicitly '
+                                   'set as boolean'):
         # Default shuffle is None.
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 938e244fb3..57f8e5fd6a 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,15 +68,16 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - TypeError: `shuffle` is not bool. + ValueError: if 'shuffle' is not provided or a bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index e5912a3b28..dcecf6dd61 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,8 +70,9 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 8e2ec83020..51a61adb21 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns), len(placeholders))) + len(dataframe.columns) + 1, len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index c80af08fba..2f439f765e 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initalized(): +def _any_variable_initialized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initalized(): + if _any_variable_initialized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 6688a84130..5e094ae92b 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD +from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Apply a mask - s_2 = keras.layers.Lambda(lambda k: - K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) + # Read m + m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) + s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train > 0} + 'input_m': input_m_train.astype(np.str)} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test > 0} + 'input_m': input_m_test.astype(np.str)} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index e487f583be..f608dea430 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,6 +93,8 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". 
+ References: + - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 70b6a8431a..9f91368e5b 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,15 +724,6 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) - if self.write_grads: - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -759,6 +750,18 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) + if self.write_grads: + for weight in layer.trainable_weights: + mapped_weight_name = weight.name.replace(':', '_') + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) + if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index b355f4a269..5062a26580 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,6 +653,8 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) + # non_trainable_weights: moving_variance, moving_mean + model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index a4cd017d60..1c9135982e 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. self._layers = [] - # Used in symbolic mode only, only in conjonction with graph-networks + # Used in symbolic mode only, only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 6a94986b9c..7e82db028b 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. 
self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 89c1f1a40f..fce6cbdb7a 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -409,11 +410,13 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - [[1.]], shape=[None, None], name=name + '_sample_weights')) + constant_op.constant([[1.]], dtype=K.floatx()), + shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - [1.], shape=[None], name=name + '_sample_weights')) + constant_op.constant([1.], dtype=K.floatx()), + shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2ecbff3a1c..e8838cd3bc 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they folow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost. diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index a54d6da839..c519e194bd 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_std=None, target_max=2) + target_mean=0., target_max=2, target_min=-2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(3. / fan_in) + std = np.sqrt(1. 
/ fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / fan_in) + std = np.sqrt(2. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(1. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / fan_in) + std = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 5061825d38..f60064ed63 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,7 +19,9 @@ from __future__ import division from __future__ import print_function import copy +import sys import types as python_types +import warnings import numpy as np @@ -714,6 +716,7 @@ class Lambda(Layer): return self.mask def get_config(self): + module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -721,21 +724,26 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' + output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' + output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' + output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, + 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, + 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -745,8 +753,16 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() + module = config.pop('module', None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. ' + 'It may cause errors.'.format(module) + , UserWarning) if custom_objects: - globs = dict(list(globs.items()) + list(custom_objects.items())) + globs.update(custom_objects) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -760,6 +776,14 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) + output_shape_module = config.pop('output_shape_module', None) + if output_shape_module in sys.modules: + globs.update(sys.modules[output_shape_module].__dict__) + elif output_shape_module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' + 'It may cause errors.'.format(output_shape_module) + , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index c616d8f24f..e6e45902a8 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,5 +144,19 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) +class TestModelBackend(test.TestCase): + + def test_model_backend_float64_use_cases(self): + # Test case for GitHub issue 19318 + floatx = keras.backend.floatx() + keras.backend.set_floatx('float64') + + x = keras.Input((5,)) + y = keras.layers.Dense(1)(x) + model = keras.models.Model(x, y) + model.compile('rmsprop', 'mse') + + keras.backend.set_floatx(floatx) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 9d54add264..94ed8ebd31 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,6 +130,16 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testHalfInt(self): + s = lambda strs: [x.decode("ascii") for x in strs] + + with self.test_session(): + input_ = array_ops.placeholder(dtypes.int16) + int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] + output = string_ops.as_string(input_) + result = output.eval(feed_dict={input_: int_inputs_}) + self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 08b03f8518..16fdedac41 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index e08123b041..fb52d10475 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,9 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -414,6 +417,16 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) + def testClipByValueEmptyTensor(self): + # Test case for GitHub issue 19337 + zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) + x = clip_ops.clip_by_value(zero, zero, zero) + y = clip_ops.clip_by_value(zero, 1.0, 1.0) + z = clip_ops.clip_by_value(zero, zero, 1.0) + w = clip_ops.clip_by_value(zero, 1.0, zero) + with self.test_session(use_gpu=True) as sess: + sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 8699fd5b25..80ba7dafc9 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - print("expected = ", e_value) - print("actual = ", c_value) + tf_logging.info("expected = ", e_value) + tf_logging.info("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
err = np.fabs(jacob_t - reference_jacob_t).max() - print("conv_2d gradient error = ", err) + tf_logging.info("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 91ebe8de99..58e2a8ac2a 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,7 +197,21 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [0, 1, 2] + indices = [[[0], [7]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[1, :\] = \[7\] does not index into param " + r"\(shape: \[3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -207,7 +221,21 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlices(self): + def testBadIndicesWithSlicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2]] + indices = [[[0], [0], [1]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[2, :\] = \[1\] does not index into param " + r"\(shape: \[1,3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesWithSlicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index a2fcd751df..033fa95935 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,7 +27,8 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.float32, dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.int64, dtypes.float32, + dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -122,6 +123,9 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -177,7 +181,19 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2], [3, 4, 5]] + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): + array_ops.gather(params, [[7]], axis=0).eval() + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): + array_ops.gather(params, [[7]], axis=1).eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index a9b55854f1..795aa67248 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,6 +362,33 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) +class VarianceScalingInitializationTest(test.TestCase): + + def testNormalDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='normal') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + def testUniformDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='uniform') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + # TODO(vrv): move to sequence_ops_test? 
 class RangeTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index a0c372db7d..e95c729715 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -947,7 +947,7 @@ class PoolingTest(test.TestCase):
         output_sizes,
         x_init_value=x_init_value,
         delta=1e-2)
-    print("%s gradient error = " % func_name, err)
+    tf_logging.info("%s gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _ConstructAndTestSecondGradient(self,
@@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase):
         input_sizes,
         x_init_value=x_init_value,
         delta=1e-2)
-    print("%s second-order gradient error = " % func_name, err)
+    tf_logging.info("%s second-order gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 677253946e..253e43920b 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gc
 import re
 
 import numpy as np
@@ -434,13 +435,29 @@ class PyFuncTest(test.TestCase):
   # ----- Tests shared by py_func and eager_py_func -----
 
   def testCleanup(self):
-    for _ in xrange(1000):
-      g = ops.Graph()
-      with g.as_default():
-        c = constant_op.constant([1.], dtypes.float32)
-        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
-        _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
-    self.assertLess(script_ops._py_funcs.size(), 100)
+    # Delete everything created by previous tests to avoid side effects.
+    ops.reset_default_graph()
+    gc.collect()
+    initial_size = script_ops._py_funcs.size()
+    # Encapsulate the graph generation, so locals can be deleted.
+    def make_graphs():
+      for _ in xrange(1000):
+        g = ops.Graph()
+        with g.as_default():
+          c = constant_op.constant([1.], dtypes.float32)
+          _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
+          _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
+          # These ops have a reference to 'c' which has a reference to the graph.
+          # Checks if the functions are being deleted though the graph is referenced from them.
+          # (see #18292)
+          _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32])
+          _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32])
+
+    # Call garbage collector to enforce deletion.
+ make_graphs() + ops.reset_default_graph() + gc.collect() + self.assertEqual(initial_size, script_ops._py_funcs.size()) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 79fe927b8a..faa4b49a8d 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,7 +144,9 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64, np.complex64, np.complex128): + for vtype in (np.int32, + np.float32, np.float64, + np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -221,7 +223,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64): + for vtype in (np.int32, np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index c70a4ffce7..1a0fa744ae 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,7 +159,13 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. def clip_small_values(x): - return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x + threshold = 1e-4 + sign = np.sign(x) + + if isinstance(x, np.int32): + threshold = 1 + sign = np.random.choice([-1, 1]) + return threshold * sign if np.abs(x) < threshold else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -181,7 +187,11 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - for vtype in (np.float32, np.float64): + vtypes = [np.float32, np.float64] + if tf_scatter != state_ops.scatter_div: + vtypes.append(np.int32) + + for vtype in vtypes: for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 794be096b7..a82855dfeb 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,7 +264,9 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0)] + math_ops.unsorted_segment_sum, lambda t: 0), + (np.ndarray.__mul__, None, + math_ops.unsorted_segment_prod, lambda t: 1)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index a5bd1b6ee0..e20daccb28 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,5 +146,101 @@ class 
StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) +class StringSplitV2OpTest(test.TestCase): + + def testSplitV2(self): + strings = ["pigs on the wing", "animals"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) + self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) + self.assertAllEqual(shape, [2, 4]) + + def testSplitV2MultiCharSeparator(self): + # Match Python behavior: + # >>> '1<>2<>3'.split('<>') + # ['1', '2', '3'] + # >>> "<><>4<>5<><>6<>".split("<>") + # ['', '', '4', '5', '', '6', ''] + strings = ["1<>2<>3", "<><>4<>5<><>6<>"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep="<>") + indices, values, shape = sess.run(tokens) + self.assertAllEqual( + indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"", b"", b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 7]) + + def testSplitV2SimpleSeparator(self): + # Match Python behavior: + # >>> '1,2,3'.split(',') + # ['1', '2', '3'] + # >>> '1,2,,3,'.split(',') + # ['1', '2', '', '3', ''] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',') + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 5]) + + def testSplitV2EmptySeparator(self): + # Match Python behavior: + # >>> '1 2 3'.split() + # ['1', '2', '3'] + #>>> ' 1 2 3 '.split() + #['1', '2', '3'] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2]]) + self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) + self.assertAllEqual(shape, [2, 3]) + + def testSplitV2SimpleSeparatorMaxSplit(self): + # Match Python behavior: + # >>> '1,2,3'.split(',', maxsplit=1) + # ['1', '2,3'] + # >>> '4,5,,6,'.split(',', maxsplit=1) + # ['4', '5,,6,'] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) + self.assertAllEqual(shape, [2, 2]) + + def testSplitV2EmptySeparatorMaxSplit(self): + # Match Python behavior: + # '1 2 3'.split(maxsplit=1) + # ['1', '2 3'] + # >>> " 4 5 6 ".split(maxsplit=1) + # ['4', '5 6 '] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) + self.assertAllEqual(shape, [2, 2]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8129334703..fae63b1132 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,6 +2619,10 @@ reverse.__doc__ = 
gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") +@deprecation.deprecated_args( + None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") +@deprecation.deprecated_args( + None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 12afcd0b51..94c8d79335 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[:m, :n] = d(Re y)/d(Re x) - J[:m, n:] = d(Im y)/d(Re x) - J[m:, :n] = d(Re y)/d(Im x) - J[m:, n:] = d(Im y)/d(Im x) + J[::2, ::2] = d(Re y)/d(Re x) + J[::2, 1::2] = d(Im y)/d(Re x) + J[1::2, ::2] = d(Re y)/d(Im x) + J[1::2, 1::2] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bdcf420980..f27d9224c1 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -258,14 +259,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. - + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ @@ -280,13 +281,14 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. @@ -297,7 +299,8 @@ def random_flip_left_right(image, seed=None): def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -306,22 +309,37 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. 
Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _Assert3DImage(image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope) - return fix_image_flip_shape(image, result) + image = _AssertAtLeast3DImage(image) + shape = image.get_shape() + if shape.ndims == 3 or shape.ndims is None: + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope + ) + return fix_image_flip_shape(image, result) + elif shape.ndims == 4: + uniform_random = random_ops.random_uniform( + [array_ops.shape(image)[0]], 0, 1.0, seed=seed + ) + mirror_cond = math_ops.less(uniform_random, .5) + return array_ops.where( + mirror_cond, + image, + functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) + ) + else: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') @tf_export('image.flip_left_right') @@ -1634,13 +1652,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, name=None): +def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` of - type `uint8`. + appropriate operation to convert the input bytes `string` into a `Tensor` + of type `dtype`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1652,10 +1670,11 @@ def decode_image(contents, channels=None, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. + dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `uint8` with shape `[height, width, num_channels]` for + `Tensor` with type `dtype` and shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
@@ -1679,7 +1698,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return gen_image_ops.decode_bmp(contents) + return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1692,7 +1711,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_gif(contents) + return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1701,7 +1720,11 @@ def decode_image(contents, channels=None, name=None): def _png(): """Decodes a PNG image.""" - return gen_image_ops.decode_png(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_png(contents, channels, + dtype=dtypes.uint8 + if dtype == dtypes.uint8 + else dtypes.uint16), dtype) def check_png(): """Checks if an image is PNG.""" @@ -1717,7 +1740,8 @@ def decode_image(contents, channels=None, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_jpeg(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_jpeg(contents, channels), dtype) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1878,7 +1902,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within in this range. + supplied image within this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints. 
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 45499dcce0..2a6ab26e96 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,6 +533,37 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) + def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): + image_shape = [16, 299, 299, 3] + warmup_rounds = 100 + benchmark_rounds = 1000 + config = config_pb2.ConfigProto() + if cpu_count is not None: + config.inter_op_parallelism_threads = 1 + config.intra_op_parallelism_threads = cpu_count + with session.Session("", graph=ops.Graph(), config=config) as sess: + with ops.device(device): + inputs = variables.Variable( + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, + trainable=False, + dtype=dtypes.float32) + run_op = image_ops.random_flip_left_right(inputs) + sess.run(variables.global_variables_initializer()) + for i in xrange(warmup_rounds + benchmark_rounds): + if i == warmup_rounds: + start = time.time() + sess.run(run_op) + end = time.time() + step_time = (end - start) / benchmark_rounds + tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") + print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " + "%.2f us" % + (tag, step_time * 1e6)) + self.report_benchmark( + name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), + iters=benchmark_rounds, + wall_time=step_time) + def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -551,6 +582,15 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) + def benchmarkBatchedRandomFlipLeftRightCpu1(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) + + def benchmarkBatchedRandomFlipLeftRightCpuAll(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) + + def benchmarkBatchedRandomFlipLeftRightGpu(self): + self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) + class AdjustHueBenchmark(test.Benchmark): @@ -987,7 +1027,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) + y = image_ops.random_flip_left_right(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1008,6 +1048,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipLeftRightWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] 
== 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1057,9 +1141,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) + seed = 42 + with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=42) + y = image_ops.random_flip_up_down(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1079,6 +1165,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipUpDownWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] == 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1156,6 +1286,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, + image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1166,14 +1297,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) - for op in [ - image_ops.random_flip_left_right, - image_ops.random_flip_up_down, - ]: - with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): - op(p_wrong_rank) - - def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1208,41 +1331,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) -class RandomFlipTest(test_util.TensorFlowTestCase): - - def testRandomLeftRight(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - def testRandomUpDown(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3880,5 +3968,88 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) +class DecodeImageTest(test_util.TensorFlowTestCase): + + def testJpegUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testJpegFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 2df230d470..724fcc39cd 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,7 +467,8 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - stddev = math.sqrt(scale) + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + stddev = math.sqrt(scale) / .87962566103423978 return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 222b8ebc9d..8276047cb6 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,8 +35,9 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# use an upper-case version of them. -@tf_export("Print") +# have an upper-case version of them. For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we also allow lowercase. +@tf_export("Print", "print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index e40481f3a7..466d0dadc8 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, - `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, + `int32`, `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
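The VarianceScaling change above divides the requested standard deviation by 0.87962566103423978, the standard deviation of a unit normal truncated to [-2, 2], so the truncated samples come out with roughly the requested spread. A small sketch reproducing that constant, assuming scipy and numpy are available:

    import numpy as np
    from scipy.stats import truncnorm

    # Standard deviation of N(0, 1) truncated to [-2, 2]; matches the constant above.
    c = truncnorm.std(a=-2., b=2., loc=0., scale=1.)
    print(c)  # ~0.8796256610342398

    # Compensating by 1/c: samples truncated at two (scaled) standard deviations
    # end up with an empirical stddev close to the requested sqrt(scale).
    scale = 2.0
    stddev = np.sqrt(scale) / c
    samples = truncnorm.rvs(a=-2., b=2., loc=0., scale=stddev, size=200000)
    print(samples.std(), np.sqrt(scale))  # both ~1.414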
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. - y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. + y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32` or `float64`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 783d485892..f47f38e29e 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: - counts: A `Tensor` containing a the total count of the data (one value). + counts: A `Tensor` containing the total count of the data (one value). 
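The reduce_* docstring fixes above all make the same point: when axis is None (the default), every dimension is reduced and a single-element tensor is returned. A minimal illustration, assuming the 1.x tf.reduce_sum API:

    import tensorflow as tf

    x = tf.constant([[1., 1., 1.],
                     [1., 1., 1.]])
    tf.reduce_sum(x)                         # axis=None: all dims reduced -> 6.0
    tf.reduce_sum(x, axis=0)                 # -> [2., 2., 2.]
    tf.reduce_sum(x, axis=1, keepdims=True)  # -> [[3.], [3.]]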
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,6 +689,9 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance + # Note: stop_gradient does not change the gradient that gets + # backpropagated to the mean from the variance calculation, + # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index a0b55eb077..0c2f5b06c4 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]): + with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features) + return math_ops.maximum(alpha * features, features, name=name) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 46a5f4fae6..035b4735af 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,6 +962,16 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) + def testName(self): + np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) + outputs_with_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values), + name='test_relu_op') + self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') + outputs_without_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values)) + self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') + class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index f8676ccb5f..219562de5d 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,6 +23,7 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import +import weakref import numpy as np import six @@ -129,11 +130,14 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - self._funcs = {} + # Only store weakrefs to the funtions. The strong reference is stored in + # the graph. + self._funcs = weakref.WeakValueDictionary() def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() + # Store a weakref to the function self._funcs[token] = func return token @@ -186,7 +190,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`. 
""" - func = self._funcs[token] + func = self._funcs.get(token, None) if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -228,19 +232,6 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) -class CleanupFunc(object): - """A helper class to remove a registered function from _py_funcs.""" - - def __init__(self, token): - self._token = token - - def __del__(self): - if _py_funcs is not None: - # If _py_funcs is None, the program is most likely in shutdown, and the - # _py_funcs object has been destroyed already. - _py_funcs.remove(self._token) - - def _internal_py_func(func, inp, Tout, @@ -270,17 +261,15 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph - cleanup = CleanupFunc(token) - # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): - graph._cleanup_py_funcs_used_in_graph = [] + if not hasattr(graph, "_py_funcs_used_in_graph"): + graph._py_funcs_used_in_graph = [] - # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph - # will be destroyed and their __del__ will remove the 'token' from - # the funcs registry. - graph._cleanup_py_funcs_used_in_graph.append(cleanup) + # Store a reference to the function in the graph to ensure it stays alive + # as long as the graph lives. When the graph is destroyed, the function + # is left to the garbage collector for destruction as well. + graph._py_funcs_used_in_graph.append(func) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 0130233746..c3b16a7bd5 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,6 +84,8 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") +@deprecation.deprecated_args( + None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -597,6 +599,8 @@ class KeywordRequired(object): @tf_export("sparse_split") +@deprecation.deprecated_args( + None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index ae79c01949..0280c89c10 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,6 +91,59 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) +@tf_export("strings.split") +def string_split_v2(source, sep=None, maxsplit=-1): + """Split elements of `source` based on `sep` into a `SparseTensor`. + + Let N be the size of source (typically N will be the batch size). Split each + element of `source` based on `sep` and return a `SparseTensor` + containing the split tokens. Empty tokens are ignored. + + For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', + then the output will be + + st.indices = [0, 0; + 0, 1; + 1, 0; + 1, 1; + 1, 2] + st.shape = [2, 3] + st.values = ['hello', 'world', 'a', 'b', 'c'] + + If `sep` is given, consecutive delimiters are not grouped together and are + deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and + sep of `"<>"` returns `["1", "2", "", "3"]`. 
If `sep` is None or an empty + string, consecutive whitespace are regarded as a single separator, and the + result will contain no empty strings at the startor end if the string has + leading or trailing whitespace. + + Note that the above mentioned behavior matches python's str.split. + + Args: + source: `1-D` string `Tensor`, the strings to split. + sep: `0-D` string `Tensor`, the delimiter character. + maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result. + + Raises: + ValueError: If sep is not a string. + + Returns: + A `SparseTensor` of rank `2`, the strings split according to the delimiter. + The first column of the indices corresponds to the row in `source` and the + second column corresponds to the index of the split component in this row. + """ + if sep is None: + sep = '' + sep = ops.convert_to_tensor(sep, dtype=dtypes.string) + source = ops.convert_to_tensor(source, dtype=dtypes.string) + + indices, values, shape = gen_string_ops.string_split_v2( + source, sep=sep, maxsplit=maxsplit) + indices.set_shape([None, 2]) + values.set_shape([None]) + shape.set_shape([2]) + return sparse_tensor.SparseTensor(indices, values, shape) + def _reduce_join_reduction_dims(x, axis, reduction_indices): """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index f49e2d314d..47414c28af 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,6 +1786,23 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` + Simple example of how to reenter a premade variable scope safely: + + ```python + with tf.variable_scope("foo") as vs: + pass + + # Re-enter the variable scope. + with tf.variable_scope(vs, + auxiliary_name_scope=False) as vs1: + # Restore the original name_scope. + with tf.name_scope(vs1.original_name_scope): + v = tf.get_variable("v", [1]) + assert v.name == "foo/v:0" + c = tf.constant([1], name="c") + assert c.name == "foo/c:0" + ``` + Basic example of sharing a variable AUTO_REUSE: ```python @@ -1924,7 +1941,9 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't touch name scope. + the scope. If `False`, we don't create it. Note that the argument is + not inherited, and it only takes effect for once when creating. You + should only use it for re-entering a premade variable scope. Returns: A scope that can be captured and reused. 
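A short usage sketch of the tf.strings.split op added above, assuming a build that includes this patch and 1.x graph mode; the expected outputs are taken from the docstring example:

    import tensorflow as tf

    st = tf.strings.split(tf.constant(["hello world", "a b c"]))
    with tf.Session() as sess:
      print(sess.run(st.values))       # ['hello' 'world' 'a' 'b' 'c']
      print(sess.run(st.indices))      # [[0 0] [0 1] [1 0] [1 1] [1 2]]
      print(sess.run(st.dense_shape))  # [2 3]

    # With an explicit separator, consecutive delimiters produce empty tokens:
    st2 = tf.strings.split(tf.constant(["1<>2<><>3"]), sep="<>")
    # st2.values -> ['1', '2', '', '3']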
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100755 new mode 100644 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 1f9fbad0b4..c3bc9ccd45 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1723,7 +1723,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index bca9fa49eb..671b7e387e 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,7 +41,11 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" + +from __future__ import print_function + """ +_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -149,6 +153,7 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) +__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -333,7 +338,8 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + text) + get_module_docstring(module, package, api_name) + + text + _GENERATED_FILE_FOOTER) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 5bb3b3c444..10171b3d60 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index dc2bd40096..3051c4437e 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,6 +1532,10 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "print" + argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index a3fbe95bba..b641c39feb 100644 
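The golden-API change above records the new dtype argument on decode_image, which the DecodeImageTest cases earlier exercise. A hedged sketch of how it is used; the filename is illustrative only:

    import tensorflow as tf

    contents = tf.read_file("example.png")  # illustrative path
    img_u8  = tf.image.decode_image(contents)                    # default: uint8
    img_u16 = tf.image.decode_image(contents, dtype=tf.uint16)   # rescaled to the uint16 range
    img_f32 = tf.image.decode_image(contents, dtype=tf.float32)  # values in [0.0, 1.0]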
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,4 +4,8 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "split" + argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " + } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 5fa75e1d61..883bb93647 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,6 +322,10 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" + + # Force downgrade setuptools. + pip install --upgrade setuptools==39.1.0 + } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index d4bf546d40..b216e3549f 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 072dd6ab99..1f0fd0387a 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,6 +134,12 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi +# If caller wants the with_the_same_user script to allow bad usernames, +# pass the var to the docker environment +if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then + CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" +fi + # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -148,6 +154,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ + ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 420d390d2b..148526492d 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,7 +32,8 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" +TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" + "\.\d.dev[\d]{0,8})-(.+)\.whl") BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 60290df833..88f1d04193 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,3 +115,7 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip2 install --upgrade setuptools==39.1.0 +pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index edb9d4b929..acd69ef346 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then fi set -e -pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -86,4 +85,7 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 +# Install last working version of setuptools. +pip3.5 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 5635977731..323b30f48e 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,7 +49,6 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 -pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -101,4 +100,8 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip3 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh new file mode 100755 index 0000000000..10a09a415a --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Usage: basic_mkl_test.sh + +# Helper function to traverse directories up until given file is found. +function upsearch () { + test / == "$PWD" && return || \ + test -e "$1" && echo "$PWD" && return || \ + cd .. && upsearch "$1" +} + +# Set up WORKSPACE. +WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" + +BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index 1bd1852ffc..b8bce57c87 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,6 +79,7 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" + WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -86,6 +87,7 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' + WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -100,6 +102,8 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ + //tensorflow:libtensorflow.so \ + //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -112,10 +116,12 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index 47539b2423..f8f63e276c 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,7 +31,11 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - undname_bin_path = find_msvc_tool(repository_ctx, vc_path, 
"undname.exe").replace("\\", "\\\\") + + undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe") + if undname == None: + auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) + undname_bin_path = undname.replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index 06c2b997cb..b0114721bd 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,9 +64,6 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" -# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below -DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" - # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -77,8 +74,7 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} - echo "use default whl file location" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi while true; do @@ -131,7 +127,11 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +# Download whl file into the build context directory. +if [[ -z "${WHL_FILE_LOCATION}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index 935535312d..e188c88c8f 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - die "whl URL is not specified" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi # Create docker build context directory. @@ -121,8 +121,13 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +if [[ -z "${WHL_URL}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +else + wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +fi # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 406d134699..57a491255e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index a6cd44ced1..6796ad70e5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.8 +ARG TF_BRANCH=r1.9 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 2fe47f3356..204b5b4dba 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libcudnn7-dev=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7-dev=7.1.4.18-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index bff4a20392..9197651ff4 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index d0fd0fae97..d149365ac1 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,6 +61,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:test_lib", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", + "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index 0c4065bc77..f7e42ce536 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,51 +41,15 @@ function is_windows() { fi } -function main() { +function prepare_src() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - DEST=$(real_path $1) - TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) - - PKG_NAME_FLAG="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - PROJECT_NAME="" - while true; do - if [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - fi - 
shift - - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - echo $(date) : "=== Using tmpdir: ${TMPDIR}" + TMPDIR="$1" + mkdir -p "$TMPDIR" + echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -155,17 +119,28 @@ function main() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} + pushd ${RUNFILES%org_tensorflow} > /dev/null for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd + popd > /dev/null cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} +} + +function build_wheel() { + if [ $# -lt 2 ] ; then + echo "No src and dest dir provided" + exit 1 + fi + + TMPDIR="$1" + DEST="$2" + PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -173,15 +148,110 @@ function main() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} + pushd ${TMPDIR} > /dev/null rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd - rm -rf ${TMPDIR} + popd > /dev/null echo $(date) : "=== Output wheel file is in: ${DEST}" } +function usage() { + echo "Usage:" + echo "$0 [--src srcdir] [--dst dstdir] [options]" + echo "$0 dstdir [options]" + echo "" + echo " --src prepare sources in srcdir" + echo " will use temporary dir if not specified" + echo "" + echo " --dst build wheel in dstdir" + echo " if dstdir is not set do not build, only prepare sources" + echo "" + echo " Options:" + echo " --project_name set project name to name" + echo " --gpu build tensorflow_gpu" + echo " --gpudirect build tensorflow_gpudirect" + echo " --nightly_flag build tensorflow nightly" + echo "" + exit 1 +} + +function main() { + PKG_NAME_FLAG="" + PROJECT_NAME="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + SRCDIR="" + DSTDIR="" + CLEANSRC=1 + while true; do + if [[ "$1" == "--help" ]]; then + usage + exit 1 + elif [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == "--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + elif [[ "$1" == "--src" ]]; then + shift + SRCDIR="$(real_path $1)" + CLEANSRC=0 + elif [[ "$1" == "--dst" ]]; then + shift + DSTDIR="$(real_path $1)" + else + DSTDIR="$(real_path $1)" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then + echo "No destination dir provided" + usage + exit 1 + fi + + if [[ -z "$SRCDIR" ]]; then + # make temp srcdir if none set + SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" + fi + + prepare_src "$SRCDIR" + + if [[ -z "$DSTDIR" ]]; then 
+ # only want to prepare sources + exit + fi + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" + + if [[ $CLEANSRC -ne 0 ]]; then + rm -rf "${TMPDIR}" + fi +} + main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d25a9e77b1..97f625e7e9 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.8.0' +_VERSION = '1.9.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,6 +54,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', + 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 29add6d5ea..15d7c70281 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,6 +814,9 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); + Print(); + Print("#include "); // for `std::stable_sort()` + Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index df71840b64..92bb5127da 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( - len(flat_b))) + tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " + + str(len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - print("Tensors have {0} different values ({1}%), with mean difference" - " {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, mean_difference, - mean_abs_difference)) + tf_logging.info("Tensors have {0} different values ({1}%), with mean" + " difference {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, + mean_difference, mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index 9c45359ee1..c030575109 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,7 +89,6 @@ import shutil from six import text_type from google.cloud import datastore -from six import text_type def is_real_file(dirpath, fname): diff --git a/tensorflow/workspace.bzl 
b/tensorflow/workspace.bzl index 161d1dbd06..b4fbbd6c23 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" ], - sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", - strip_prefix = "mklml_lnx_2018.0.2.20180127", + sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", + strip_prefix = "mklml_lnx_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" ], - sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", - strip_prefix = "mklml_win_2018.0.2.20180127", + sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", + strip_prefix = "mklml_win_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" ], - sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", - strip_prefix = "mklml_mac_2018.0.2.20180127", + sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", + strip_prefix = "mklml_mac_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", ], - sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", - strip_prefix = "mkl-dnn-0.13", + sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", + strip_prefix = "mkl-dnn-0.14", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", - "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + 
"http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", + "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", ], - sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", - strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", + sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", + strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 07bb6645eb..e54c1a4501 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,6 +64,7 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", + "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 1b8e40765e..08cb84ea2c 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,6 +10,7 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", + "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 4418ac32fc..663a218733 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,8 +291,10 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", + "jerror.h", "jinclude.h", "jmorecfg.h", + "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 76ab32d69c..17c5449cc0 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,7 +28,14 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ], + ] + select({ + "@org_tensorflow//tensorflow:linux_ppc64le": [ + "powerpc/powerpc_init.c", + "powerpc/filter_vsx_intrinsics.c", + ], + "//conditions:default": [ + ], + }), hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 954f21f5f8..3c7e5c8469 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,6 +6,7 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" +_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -152,6 +153,22 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) +def _get_bash_bin(repository_ctx): + """Gets the bash bin path.""" + bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) + if bash_bin != None: + return bash_bin + else: + bash_bin_path = repository_ctx.which("bash") + if bash_bin_path != None: + return str(bash_bin_path) + else: + _fail("Cannot find bash in PATH, please make sure " + + "bash is installed and add its directory in PATH, or --define " + + "%s='/path/to/bash'.\nPATH=%s" % ( + _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) + + def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -184,14 +201,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -199,7 +216,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -294,6 +311,7 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ + _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 36f5aa5bde..cb67d3e961 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,7 +17,6 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", - "gemmlowp", ]) def _is_windows(ctx): @@ -88,7 +87,9 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - ctx.template("BUILD", ctx.attr.build_file, { + # Use BUILD.bazel to avoid conflict with third party projects with + # BUILD or build (directory) underneath. + ctx.template("BUILD.bazel", ctx.attr.build_file, { "%prefix%": ".." if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From 4631936e61651101932073197c08b600006530a3 Mon Sep 17 00:00:00 2001 From: gracehoney <31743510+aaroey@users.noreply.github.com> Date: Thu, 21 Jun 2018 15:23:05 -0700 Subject: Fix internal build errors. 
--- configure.py | 2 +- tensorflow/contrib/tensorrt/BUILD | 1 + .../contrib/tensorrt/convert/convert_graph.cc | 94 +++++++++++++--------- .../contrib/tensorrt/convert/convert_nodes.cc | 7 +- .../contrib/tensorrt/convert/convert_nodes.h | 9 +-- tensorflow/contrib/tensorrt/convert/utils.h | 2 +- .../contrib/tensorrt/kernels/trt_engine_op.cc | 28 +++---- .../contrib/tensorrt/kernels/trt_engine_op.h | 10 +-- tensorflow/contrib/tensorrt/python/trt_convert.py | 12 ++- .../tensorrt/resources/trt_int8_calibrator.cc | 1 - .../contrib/tensorrt/resources/trt_resources.h | 12 +-- tensorflow/contrib/tensorrt/test/test_tftrt.py | 11 ++- 12 files changed, 101 insertions(+), 88 deletions(-) (limited to 'configure.py') diff --git a/configure.py b/configure.py index a14d006a73..ad585fa52e 100644 --- a/configure.py +++ b/configure.py @@ -944,7 +944,7 @@ def set_tf_cudnn_version(environ_cp): def is_cuda_compatible(lib, cuda_ver, cudnn_ver): - """Check the compatibility between given library and cudnn/cudart libraries.""" + """Check compatibility between given library and cudnn/cudart libraries.""" ldd_bin = which('ldd') or '/usr/bin/ldd' ldd_out = run_shell([ldd_bin, lib], True) ldd_out = ldd_out.split(os.linesep) diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index e7b3fe38e5..adda0b758b 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -207,6 +207,7 @@ tf_cuda_library( ], deps = [ ":trt_logging", + ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", "//tensorflow/core:lib_proto_parsing", diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index ba7d3b5f86..1c4fd4a0ce 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -49,13 +49,14 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include +#include "cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { namespace tensorrt { @@ -238,14 +239,14 @@ tensorflow::Status ConvertGraphDefToTensorRT( } // Function to get subsegment information structure. 
-EngineInfo GetEngineInfo( +tensorflow::Status GetEngineInfo( const tensorflow::Graph* g, const tensorflow::grappler::GraphProperties& graph_properties, const std::set& segment_nodes, const std::unordered_map& node_map, - const std::vector& reverse_topo_order) { + const std::vector& reverse_topo_order, + EngineInfo* info) { std::vector subgraph_node_ids; - EngineInfo info; std::set segment_devices; int input_port = 0; int output_port = 0; @@ -296,9 +297,9 @@ EngineInfo GetEngineInfo( created_edges.insert({s, port}); input_port++; } - info.connections.emplace_back(input_node->name(), input_node->id(), - edge->src_output(), node_name, node_id, - edge->dst_input(), true, port); + info->connections.emplace_back(input_node->name(), input_node->id(), + edge->src_output(), node_name, node_id, + edge->dst_input(), true, port); } } } @@ -316,28 +317,28 @@ EngineInfo GetEngineInfo( created_edges.insert({s, port}); output_port++; } - info.connections.emplace_back(output_node->name(), output_node->id(), - edge->dst_input(), node_name, node_id, - edge->src_output(), false, port); + info->connections.emplace_back(output_node->name(), output_node->id(), + edge->dst_input(), node_name, node_id, + edge->src_output(), false, port); } } } - ConvertSegmentToGraphDef(g, graph_properties, subgraph_node_ids, - &info.connections, &info.segment_graph_def, - &info.engine_name); + TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( + g, graph_properties, subgraph_node_ids, &info->connections, + &info->segment_graph_def, &info->engine_name)); // TODO(sami): This should not happen once segmenter is updated. if (segment_devices.size() == 1) { - info.device = *segment_devices.begin(); + info->device = *segment_devices.begin(); } else if (segment_devices.size() > 1) { LOG(WARNING) << "Detected multiple(" << segment_devices.size() << ") devices for the segment. Picking first one to continue " << "but this shouldn't have happened"; - info.device = *segment_devices.begin(); + info->device = *segment_devices.begin(); } else { VLOG(1) << "Segment devices size is 0"; } - return info; + return Status::OK(); } // Function to insert a TRT node into the graph. 
The graph is not modified if @@ -562,7 +563,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( tensorflow::NodeDefBuilder node_builder( StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp); VLOG(1) << "Adding " << StrCat(name, "_Arg"); - node_builder.Attr("T", node->output_type(0)).Attr("index", i).Finalize(&nd); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); tensorflow::Status s; auto node_arg = sgraph.AddNode(nd, &s); if (!s.ok()) { @@ -593,7 +596,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( VLOG(1) << " input " << nout.node << ":" << nout.index << " dtype=" << tensorflow::DataTypeString(nout.data_type); node_builder.Input({nout}); - node_builder.Attr("T", node->output_type(0)).Attr("index", i).Finalize(&nd); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); if (VLOG_IS_ON(3)) { VLOG(3) << nd.DebugString(); } @@ -713,11 +718,12 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { segment_options.exclude_node_list.insert(node); } segment_options.minimum_segment_size = params.minimum_segment_size; - tensorflow::tensorrt::segment::SegmentNodesVector segments; + tensorflow::tensorrt::segment::SegmentNodesVector initial_segments; TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( - &graph, IsTensorRTCandidate, segment_options, &segments)); - if (segments.size() > 1) { - VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size(); + &graph, IsTensorRTCandidate, segment_options, &initial_segments)); + if (initial_segments.size() > 1) { + VLOG(0) << "MULTIPLE tensorrt candidate conversion: " + << initial_segments.size(); } // Get the EngineInfo for each segment. @@ -725,17 +731,24 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); float total_num_nodes_in_segments = 0.; std::vector engine_segments; - engine_segments.reserve(segments.size()); + engine_segments.reserve(initial_segments.size()); std::vector reverse_topo_order; tensorflow::GetPostOrder(graph, &reverse_topo_order); size_t total_engine_bytes_size = 0; std::vector engine_bytes_size; - for (size_t t = 0; t < segments.size(); t++) { - auto& s = segments.at(t); - engine_segments.emplace_back(GetEngineInfo(&graph, *params.graph_properties, - s.first, node_map, - reverse_topo_order)); - auto& curr_engine = engine_segments.back(); + tensorflow::tensorrt::segment::SegmentNodesVector converted_segments; + converted_segments.reserve(initial_segments.size()); + for (size_t t = 0; t < initial_segments.size(); t++) { + auto& curr_segment = initial_segments.at(t); + EngineInfo curr_engine; + Status status = + GetEngineInfo(&graph, *params.graph_properties, curr_segment.first, + node_map, reverse_topo_order, &curr_engine); + if (!status.ok()) { + LOG(WARNING) << "Failed to get engine info for segment " << t << ": " + << status; + continue; + } curr_engine.precision_mode = params.precision_mode; curr_engine.engine_type = (params.is_dyn_op || params.precision_mode == INT8MODE @@ -744,12 +757,19 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { curr_engine.cached_engine_batches = params.cached_engine_batches; curr_engine.maximum_cached_engines = params.max_cached_engines; StrAppend(&curr_engine.engine_name, "my_trt_op_", t); - RegisterSegmentFunctionToFunctionLibrary( + status = RegisterSegmentFunctionToFunctionLibrary( &graph, curr_engine.segment_graph_def, 
curr_engine.engine_name); + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef as a function " << t + << ": " << status; + continue; + } engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); total_engine_bytes_size += engine_bytes_size.back(); - total_num_nodes_in_segments += s.first.size(); + total_num_nodes_in_segments += curr_segment.first.size(); + engine_segments.push_back(std::move(curr_engine)); + converted_segments.push_back(std::move(curr_segment)); if (VLOG_IS_ON(8)) { string fname = curr_engine.engine_name; @@ -775,7 +795,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { engine.max_workspace_size_bytes = params.max_workspace_size_bytes * (engine_bytes_size.at(i) / total_engine_bytes_size + - segments.at(i).first.size() / total_num_nodes_in_segments) / + converted_segments.at(i).first.size() / total_num_nodes_in_segments) / 2.0; // The allocator is used to build the engine. The build and the built engine // will be destroyed after we get the serialized engine string, so it's fine @@ -793,17 +813,17 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { cudaSetDevice(cuda_device_id); auto status = CreateTRTNode(&graph, engine_segments, i, alloc.get(), params.max_batch_size); - // If status is ok, we successfuly added the node to the graph and can + // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. if (status.ok()) { - for (auto node_name : segments.at(i).first) { + for (auto node_name : converted_segments.at(i).first) { graph.RemoveNode(node_map.at(node_name)); } } else { // Graph is not modified. LOG(WARNING) << "Engine creation for segment " << i << ", composed of " - << segments.at(i).first.size() << " nodes failed: " << status - << ". Skipping..."; + << converted_segments.at(i).first.size() << " nodes failed: " + << status << ". Skipping..."; } } cudaSetDevice(old_cuda_device); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index b5214b461a..146b9c7344 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -2130,13 +2130,10 @@ void Converter::register_op_converters() { } // namespace tensorflow::Status ConvertGraphDefToEngine( - const tensorflow::GraphDef& gdef, - int precision_mode, - int max_batch_size, + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, size_t max_workspace_size_bytes, const std::vector& input_shapes, - Logger* logger, - nvinfer1::IGpuAllocator* allocator, + Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool* convert_successfully) { diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 2da4edf7f5..7684d8d4a2 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -78,7 +78,7 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), - precision_mode(FP32MODE) {}; + precision_mode(FP32MODE) {} string engine_name; string device; @@ -120,13 +120,10 @@ tensorflow::Status ConvertSegmentToGraphDef( // is successful. This is different than successfully building the engine: // building can still fail afterwards. 
tensorflow::Status ConvertGraphDefToEngine( - const tensorflow::GraphDef& gdef, - int precision_mode, - int max_batch_size, + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, size_t max_workspace_size_bytes, const std::vector& input_shapes, - Logger* logger, - nvinfer1::IGpuAllocator* allocator, + Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool* convert_successfully); diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/contrib/tensorrt/convert/utils.h index 021fdaf8c5..f601c06701 100644 --- a/tensorflow/contrib/tensorrt/convert/utils.h +++ b/tensorflow/contrib/tensorrt/convert/utils.h @@ -31,7 +31,7 @@ struct TrtDestroyer { template using TrtUniquePtrType = std::unique_ptr>; -} // namespace convert } // namespace tensorrt +} // namespace tensorflow #endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index d12f738ac5..75e32559bb 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h" #include -#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" @@ -77,9 +77,8 @@ tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { } auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); if (fdef == nullptr) { - return tensorflow::errors::Internal( - "Native FunctionDef ", funcdef_name_, - " can't be found in function library"); + return tensorflow::errors::Internal("Native FunctionDef ", funcdef_name_, + " can't be found in function library"); } tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.overlay_lib = nullptr; @@ -128,8 +127,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) } else if (precision_string == "INT8") { precision_mode_ = convert::INT8MODE; } - calibration_mode_ = (precision_mode_ == convert::INT8MODE && - calibration_data.size() == 0); + calibration_mode_ = + (precision_mode_ == convert::INT8MODE && calibration_data.size() == 0); if (calibration_data.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); @@ -291,8 +290,8 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, std::vector buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { const string inp_name = StrCat(kInputPHName, i); - const size_t binding_index = trt_engine_ptr->getBindingIndex( - inp_name.c_str()); + const size_t binding_index = + trt_engine_ptr->getBindingIndex(inp_name.c_str()); const Tensor& input_tensor = ctx->input(i); const TensorShape& input_shape = input_tensor.shape(); @@ -320,7 +319,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, default: LOG(ERROR) << "Unknown TRT data type: " << int(dtype); ctx->SetStatus(tensorflow::errors::InvalidArgument( - "Unknown ouput TRT data type! ", int(dtype))); + "Unknown ouput TRT data type! 
", static_cast(dtype))); return; } } @@ -343,8 +342,8 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, &output_shape)); } else { LOG(ERROR) << "output node not found, at " << output_name; - ctx->SetStatus(tensorflow::errors::Internal( - "output ", output_name, " couldn't be found!")); + ctx->SetStatus(tensorflow::errors::Internal("output ", output_name, + " couldn't be found!")); return; } auto status = ctx->allocate_output(i, output_shape, &output_tensor); @@ -370,7 +369,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, "INT8 outputs are not supported!")); return; default: - LOG(ERROR) << "Unknown TRT data type: " << int(dtype); + LOG(ERROR) << "Unknown TRT data type: " << static_cast(dtype); ctx->SetStatus(tensorflow::errors::InvalidArgument( "Unsupported output data type! ", int(dtype))); return; @@ -442,7 +441,7 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, if (allocator == nullptr) { // GetAllocator already set the Status. return null_pair; - }; + } infer->setGpuAllocator(allocator); #endif TrtUniquePtrType static_engine( @@ -506,8 +505,7 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, } tensorflow::Status TRTEngineOp::AllocateCalibrationResources( - tensorflow::OpKernelContext* ctx, - TRTCalibrationResource** cr) { + tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) { auto cres = new TRTCalibrationResource(); *cr = cres; // Get the allocator. diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 0d2f9e8a9d..6fe318be6a 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -52,19 +52,17 @@ class TRTEngineOp : public AsyncOpKernel { private: // Execute calibration - void ExecuteCalibration(OpKernelContext* ctx, - AsyncHelper* helper); + void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); // Construct a function handle for executing native funcdef graph Status ConstructFunctionHandle(OpKernelContext* ctx); // Execute replaced native segment as function Op. - void ExecuteNativeSegment(OpKernelContext* ctx, - AsyncHelper* helper); + void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); // Allocate necessary resources for calibration - Status AllocateCalibrationResources( - OpKernelContext* ctx, TRTCalibrationResource** cr); + Status AllocateCalibrationResources(OpKernelContext* ctx, + TRTCalibrationResource** cr); // TODO(samikama): context should go to a resource manager! 
typedef std::pair, diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index 490c74a701..79f512dbcf 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -21,9 +21,9 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long import six as _six from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert -from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert -from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version +from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version +from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.framework import errors @@ -58,6 +58,10 @@ def create_inference_graph(input_graph_def, precision_mode: one of 'FP32', 'FP16' and 'INT8' minimum_segment_size: the minimum number of nodes required for a subgraph to be replaced by TRTEngineOp. + is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT + network and engine at run time. + maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops. + cached_engine_batches: batch sizes used to pre-create cached engines. Returns: New GraphDef with TRTEngineOps placed in graph replacing subgraphs. @@ -81,7 +85,7 @@ def create_inference_graph(input_graph_def, "TensorRT %s but library loaded from environment is TensorRT %s" % (".".join([str(x) for x in compiled_version]), ".".join([str(x) for x in loaded_version])) + - ". Please make sure that correct version of TensorRT "\ + ". Please make sure that correct version of TensorRT " + "is available in the system and added to ldconfig or LD_LIBRARY_PATH" ) raise RuntimeError("Incompatible TensorRT library version") @@ -178,7 +182,7 @@ def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False): is_calib_graph = False for n in calibration_graph_def.node: if n.op == "TRTEngineOp": - is_calib_graph = is_calib_graph or len(n.attr["calibration_data"].s) == 0 + is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s if not is_calib_graph: tf_logging.error( "Not a calib graph. Doesn't seem to contain any calibration nodes.") diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc index 59ae860bc0..32e81858b9 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc @@ -16,7 +16,6 @@ limitations under the License. 
#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" #include -#include #include #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index 76863503bd..b7d5ffd674 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -49,15 +49,15 @@ class TRTCalibrationResource : public tensorflow::ResourceBase { string DebugString() override { std::stringstream oss; - using std::hex; using std::dec; using std::endl; + using std::hex; oss << " Calibrator = " << hex << calibrator_.get() << dec << endl - << " Builder = " << hex << builder_.get() << dec << endl - << " Engine = " << hex << engine_.get() << dec << endl - << " Logger = " << hex << &logger_ << dec << endl - << " Allocator = " << hex << allocator_.get() << dec << endl - << " Thread = " << hex << thr_.get() << dec << endl; + << " Builder = " << hex << builder_.get() << dec << endl + << " Engine = " << hex << engine_.get() << dec << endl + << " Logger = " << hex << &logger_ << dec << endl + << " Allocator = " << hex << allocator_.get() << dec << endl + << " Thread = " << hex << thr_.get() << dec << endl; return oss.str(); } diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py index 5e74f9295d..090aa8bdb0 100644 --- a/tensorflow/contrib/tensorrt/test/test_tftrt.py +++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py @@ -76,7 +76,7 @@ def get_multi_engine_graph_def(mode="FP32"): g = ops.Graph() with g.as_default(): x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype) - with g.name_scope("Global_scope") as scope: + with g.name_scope("Global_scope"): with g.name_scope("first_scope"): e = cop.constant( np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype) @@ -92,15 +92,14 @@ def get_multi_engine_graph_def(mode="FP32"): b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype) q = conv / b - c = cop.constant(np.random.randn(1, 4, 1, 1), name="bias3", dtype=dtype) edge = mops.sin(q) edge1 = mops.cos(conv) with g.name_scope("test_scope"): de = edge + edge1 - t = t - edge1 - q = q * edge - t = t + q - t = t - de + t -= edge1 + q *= edge + t += q + t -= de k = aops.squeeze(t, name="output") print(k.dtype) return g.as_graph_def() -- cgit v1.2.3 From 6896a74984efb4b1b77fc36ea274703536ba649d Mon Sep 17 00:00:00 2001 From: Jon Triebenbach Date: Wed, 27 Jun 2018 13:29:53 -0500 Subject: Build OpenBLAS 0.3.0 on ppc64le for TF tests --- configure.py | 7 ++++++ tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le | 2 +- tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le | 2 +- .../ci_build/install/install_openblas_ppc64le.sh | 28 ++++++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100755 tensorflow/tools/ci_build/install/install_openblas_ppc64le.sh (limited to 'configure.py') diff --git a/configure.py b/configure.py index ad585fa52e..04ad1c9441 100644 --- a/configure.py +++ b/configure.py @@ -1465,6 +1465,13 @@ def main(): environ_cp['TF_NEED_JEMALLOC'] = '0' environ_cp['TF_NEED_TENSORRT'] = '0' + # The numpy package on ppc64le uses OpenBLAS which has multi-threading + # issues that lead to incorrect answers. Set OMP_NUM_THREADS=1 at + # runtime to allow the Tensorflow testcases which compare numpy + # results to Tensorflow results to succeed. 
+ if is_ppc64le(): + write_action_env_to_bazelrc("OMP_NUM_THREADS", 1) + set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc', 'with_jemalloc', True) set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform', diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le index e879c34bbd..ada2c63880 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le +++ b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le @@ -7,7 +7,7 @@ COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa RUN /install/install_deb_packages.sh -RUN apt-get update && apt-get install -y libopenblas-dev +RUN /install/install_openblas_ppc64le.sh RUN /install/install_hdf5_ppc64le.sh RUN /install/install_pip_packages.sh RUN /install/install_bazel_from_source.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le index 8967138747..a404f129ab 100644 --- a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le +++ b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le @@ -13,7 +13,7 @@ ARG DEBIAN_FRONTEND=noninteractive RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa RUN /install/install_deb_packages.sh -RUN apt-get update && apt-get install -y libopenblas-dev +RUN /install/install_openblas_ppc64le.sh RUN /install/install_hdf5_ppc64le.sh RUN /install/install_pip_packages.sh RUN /install/install_bazel_from_source.sh diff --git a/tensorflow/tools/ci_build/install/install_openblas_ppc64le.sh b/tensorflow/tools/ci_build/install/install_openblas_ppc64le.sh new file mode 100755 index 0000000000..9ace25a36f --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_openblas_ppc64le.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +OPENBLAS_SRC_PATH=/tmp/openblas_src/ +POWER="POWER8" +USE_OPENMP="USE_OPENMP=1" +OPENBLAS_INSTALL_PATH="/usr" +apt-get install -y gfortran gfortran-5 +sudo rm -rf ${OPENBLAS_SRC_PATH} +git clone -b release-0.3.0 https://github.com/xianyi/OpenBLAS ${OPENBLAS_SRC_PATH} +cd ${OPENBLAS_SRC_PATH} +# Pick up fix for OpenBLAS issue 1571 +git cherry-pick -X theirs 961d25e9c7e4a1758adb1dbeaa15187de69dd052 +make TARGET=${POWER} ${USE_OPENMP} FC=gfortran +make PREFIX=${OPENBLAS_INSTALL_PATH} install -- cgit v1.2.3 From 1e7b0e4ad6d0f57f3241fe0b80a65f2c2a7f11b0 Mon Sep 17 00:00:00 2001 From: Mingxing Tan Date: Thu, 28 Jun 2018 19:13:20 -0700 Subject: Merge changes from github. 
PiperOrigin-RevId: 202585094 --- .gitignore | 1 + configure.py | 72 +- tensorflow/BUILD | 26 + tensorflow/c/c_api.cc | 6 +- tensorflow/compiler/aot/codegen.cc | 2 +- tensorflow/compiler/xla/rpc/BUILD | 6 +- tensorflow/compiler/xla/service/BUILD | 1 - tensorflow/compiler/xla/service/hlo_instruction.cc | 1 + .../compiler/xla/service/hlo_instruction_test.cc | 34 + tensorflow/contrib/autograph/converters/BUILD | 5 +- .../contrib/autograph/operators/control_flow.py | 2 +- .../contrib/autograph/pyct/static_analysis/cfg.py | 2 +- tensorflow/contrib/autograph/pyct/transformer.py | 4 +- tensorflow/contrib/cmake/CMakeLists.txt | 36 +- .../contrib/cmake/external/double_conversion.cmake | 6 +- tensorflow/contrib/cmake/external/mkl.cmake | 68 ++ tensorflow/contrib/cmake/external/mkldnn.cmake | 12 +- tensorflow/contrib/cmake/tf_python.cmake | 77 +- tensorflow/contrib/cmake/tf_shared_lib.cmake | 5 + .../contrib/constrained_optimization/README.md | 2 +- .../python/swap_regret_optimizer.py | 8 +- .../python/kernel_tests/slide_dataset_op_test.py | 42 +- tensorflow/contrib/data/python/ops/sliding.py | 2 +- .../nmt_with_attention/nmt_with_attention.ipynb | 909 +++++++++++++++++ .../gan/python/estimator/python/head_impl.py | 6 +- .../gan/python/estimator/python/head_test.py | 9 +- tensorflow/contrib/gdr/gdr_server_lib.cc | 2 +- .../optimized/depthwiseconv_uint8_3x3_filter.h | 2 +- .../interpreter_wrapper/interpreter_wrapper.h | 2 + tensorflow/contrib/opt/BUILD | 20 + tensorflow/contrib/opt/__init__.py | 11 +- .../opt/python/training/weight_decay_optimizers.py | 362 +++++++ .../training/weight_decay_optimizers_test.py | 188 ++++ .../contrib/solvers/python/ops/linear_equations.py | 1 - tensorflow/contrib/tensorrt/BUILD | 20 +- .../contrib/tensorrt/convert/convert_graph.cc | 1027 +++++++++++++------- .../contrib/tensorrt/convert/convert_graph.h | 61 +- .../contrib/tensorrt/convert/convert_nodes.cc | 801 +++++---------- .../contrib/tensorrt/convert/convert_nodes.h | 133 ++- .../tensorrt/convert/trt_optimization_pass.cc | 48 +- .../tensorrt/convert/trt_optimization_pass.h | 3 + tensorflow/contrib/tensorrt/convert/utils.h | 37 + .../contrib/tensorrt/kernels/trt_engine_op.cc | 588 +++++++++-- .../contrib/tensorrt/kernels/trt_engine_op.h | 98 +- tensorflow/contrib/tensorrt/ops/trt_engine_op.cc | 18 +- tensorflow/contrib/tensorrt/python/trt_convert.py | 55 +- .../contrib/tensorrt/resources/trt_allocator.cc | 2 +- .../contrib/tensorrt/resources/trt_allocator.h | 5 +- .../tensorrt/resources/trt_int8_calibrator.cc | 34 +- .../tensorrt/resources/trt_int8_calibrator.h | 35 +- .../contrib/tensorrt/resources/trt_resources.h | 49 +- tensorflow/contrib/tensorrt/segment/segment.h | 7 +- tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc | 76 +- tensorflow/contrib/tensorrt/test/test_tftrt.py | 138 ++- tensorflow/contrib/tensorrt/trt_conversion.i | 98 +- tensorflow/contrib/tpu/profiler/BUILD | 2 +- tensorflow/contrib/verbs/BUILD | 4 +- tensorflow/core/api_def/BUILD | 7 + .../api_def_SampleDistortedBoundingBox.pbtxt | 2 +- .../api_def_SampleDistortedBoundingBoxV2.pbtxt | 2 +- .../api_def/base_api/api_def_SlideDataset.pbtxt | 2 +- .../core/api_def/java_api/api_def_Assert.pbtxt | 4 + .../core/api_def/java_api/api_def_Const.pbtxt | 4 + .../core/api_def/java_api/api_def_Switch.pbtxt | 4 + .../direct_session_with_tracking_alloc_test.cc | 18 +- .../core/common_runtime/mkl_cpu_allocator.cc | 7 + tensorflow/core/debug/BUILD | 4 +- tensorflow/core/distributed_runtime/BUILD | 4 +- tensorflow/core/distributed_runtime/eager/BUILD | 4 +- 
tensorflow/core/distributed_runtime/rpc/BUILD | 36 +- .../core/distributed_runtime/rpc/eager/BUILD | 7 +- .../distributed_runtime/rpc/grpc_server_lib.cc | 6 + .../core/distributed_runtime/rpc/grpc_server_lib.h | 3 + tensorflow/core/graph/mkl_layout_pass_test.cc | 21 +- tensorflow/core/kernels/data/slide_dataset_op.cc | 51 +- tensorflow/core/kernels/mkl_conv_ops.cc | 332 ++++--- tensorflow/core/kernels/reduction_gpu_kernels.cu.h | 2 +- tensorflow/core/kernels/segment_reduction_ops.h | 6 + tensorflow/core/ops/math_ops.cc | 8 +- tensorflow/core/platform/cloud/oauth_client.cc | 4 +- tensorflow/core/platform/default/build_config.bzl | 5 +- tensorflow/core/platform/windows/port.cc | 5 + .../core/profiler/internal/tfprof_timeline.cc | 16 +- tensorflow/core/util/mkl_util.h | 32 +- tensorflow/docs_src/get_started/index.md | 29 + tensorflow/docs_src/guide/debugger.md | 2 +- tensorflow/go/attrs.go | 245 +++++ tensorflow/go/attrs_test.go | 193 ++++ tensorflow/go/op/wrappers.go | 9 +- tensorflow/go/operation.go | 66 ++ tensorflow/go/operation_test.go | 62 ++ tensorflow/java/BUILD | 5 + tensorflow/java/maven/.gitignore | 6 + tensorflow/java/maven/README.md | 6 + tensorflow/java/maven/hadoop/pom.xml | 24 + tensorflow/java/maven/pom.xml | 2 + tensorflow/java/maven/run_inside_container.sh | 47 +- tensorflow/java/maven/spark-connector/pom.xml | 24 + tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.h | 2 + .../tensorflow/processor/OperatorProcessor.java | 348 ++++++- tensorflow/python/estimator/canned/baseline.py | 4 +- tensorflow/python/estimator/export/export.py | 6 +- tensorflow/python/keras/datasets/boston_housing.py | 7 +- tensorflow/python/keras/datasets/mnist.py | 10 +- tensorflow/python/keras/datasets/reuters.py | 6 +- tensorflow/python/keras/layers/__init__.py | 2 + tensorflow/python/keras/layers/merge.py | 4 + .../python/kernel_tests/dynamic_stitch_op_test.py | 1 - tensorflow/python/lib/core/numpy.h | 2 + tensorflow/python/lib/core/py_util.cc | 2 + tensorflow/python/ops/image_ops_impl.py | 103 +- tensorflow/python/ops/image_ops_test.py | 96 ++ tensorflow/python/ops/math_ops_test.py | 9 + tensorflow/python/ops/special_math_ops.py | 2 + tensorflow/python/ops/special_math_ops_test.py | 10 +- tensorflow/python/ops/state_ops.py | 4 +- tensorflow/python/training/checkpoint_utils.py | 2 +- tensorflow/tf_framework_version_script.lds | 11 + tensorflow/tools/api/golden/tensorflow.image.pbtxt | 4 + .../golden/tensorflow.keras.layers.-minimum.pbtxt | 176 ++++ .../golden/tensorflow.keras.layers.-subtract.pbtxt | 176 ++++ .../tools/api/golden/tensorflow.keras.layers.pbtxt | 16 + tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le | 19 + tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le | 27 + tensorflow/tools/ci_build/ci_build.sh | 4 +- .../tools/ci_build/ci_parameterized_build.sh | 8 +- .../ci_build/install/install_bazel_from_source.sh | 40 + .../install/install_buildifier_from_source.sh | 30 + .../ci_build/install/install_golang_ppc64le.sh | 22 + .../tools/ci_build/install/install_pip_packages.sh | 4 + .../install/install_python3.5_pip_packages.sh | 3 + .../install/install_python3.6_pip_packages.sh | 6 +- tensorflow/tools/ci_build/linux/gpu/run_mkl.sh | 47 + .../tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh | 29 + tensorflow/tools/git/gen_git_source.py | 11 +- tensorflow/tools/lib_package/BUILD | 4 +- tensorflow/tools/pip_package/BUILD | 2 +- tensorflow/tools/pip_package/build_pip_package.sh | 21 +- tensorflow/tools/pip_package/setup.py | 4 +- tensorflow/workspace.bzl | 80 +- 
third_party/curl.BUILD | 22 +- third_party/flatbuffers/flatbuffers.BUILD | 2 + third_party/jsoncpp.BUILD | 7 +- third_party/libxsmm.BUILD | 2 +- 145 files changed, 6294 insertions(+), 1701 deletions(-) create mode 100644 tensorflow/contrib/cmake/external/mkl.cmake create mode 100644 tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb create mode 100644 tensorflow/contrib/opt/python/training/weight_decay_optimizers.py create mode 100644 tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py create mode 100644 tensorflow/contrib/tensorrt/convert/utils.h create mode 100644 tensorflow/core/api_def/java_api/api_def_Assert.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_Const.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_Switch.pbtxt create mode 100644 tensorflow/docs_src/get_started/index.md create mode 100644 tensorflow/go/attrs.go create mode 100644 tensorflow/go/attrs_test.go create mode 100644 tensorflow/java/maven/hadoop/pom.xml create mode 100644 tensorflow/java/maven/spark-connector/pom.xml create mode 100644 tensorflow/tf_framework_version_script.lds create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt create mode 100644 tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le create mode 100644 tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le create mode 100755 tensorflow/tools/ci_build/install/install_bazel_from_source.sh create mode 100755 tensorflow/tools/ci_build/install/install_buildifier_from_source.sh create mode 100755 tensorflow/tools/ci_build/install/install_golang_ppc64le.sh create mode 100755 tensorflow/tools/ci_build/linux/gpu/run_mkl.sh create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh (limited to 'configure.py') diff --git a/.gitignore b/.gitignore index 828bbe9bd3..b5306b8b79 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ __pycache__ cmake_build/ .idea/** /build/ +[Bb]uild/ /tensorflow/core/util/version_info.cc /tensorflow/python/framework/fast_tensor_util.cpp Pods diff --git a/configure.py b/configure.py index ada342a50a..ad585fa52e 100644 --- a/configure.py +++ b/configure.py @@ -943,6 +943,35 @@ def set_tf_cudnn_version(environ_cp): write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) +def is_cuda_compatible(lib, cuda_ver, cudnn_ver): + """Check compatibility between given library and cudnn/cudart libraries.""" + ldd_bin = which('ldd') or '/usr/bin/ldd' + ldd_out = run_shell([ldd_bin, lib], True) + ldd_out = ldd_out.split(os.linesep) + cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') + cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') + cudnn = None + cudart = None + cudnn_ok = True # assume no cudnn dependency by default + cuda_ok = True # assume no cuda dependency by default + for line in ldd_out: + if 'libcudnn.so' in line: + cudnn = cudnn_pattern.search(line) + cudnn_ok = False + elif 'libcudart.so' in line: + cudart = cuda_pattern.search(line) + cuda_ok = False + if cudnn and len(cudnn.group(1)): + cudnn = convert_version_to_int(cudnn.group(1)) + if cudart and len(cudart.group(1)): + cudart = convert_version_to_int(cudart.group(1)) + if cudnn is not None: + cudnn_ok = (cudnn == cudnn_ver) + if cudart is not None: + cuda_ok = (cudart == cuda_ver) + return cudnn_ok and cuda_ok + + def set_tf_tensorrt_install_path(environ_cp): """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION. 
@@ -959,8 +988,8 @@ def set_tf_tensorrt_install_path(environ_cp): raise ValueError('Currently TensorRT is only supported on Linux platform.') # Ask user whether to add TensorRT support. - if str(int(get_var( - environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1': + if str(int(get_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', + False))) != '1': return for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): @@ -973,47 +1002,29 @@ def set_tf_tensorrt_install_path(environ_cp): # Result returned from "read" will be used unexpanded. That make "~" # unusable. Going through one more level of expansion to handle that. - trt_install_path = os.path.realpath( - os.path.expanduser(trt_install_path)) + trt_install_path = os.path.realpath(os.path.expanduser(trt_install_path)) def find_libs(search_path): """Search for libnvinfer.so in "search_path".""" fl = set() if os.path.exists(search_path) and os.path.isdir(search_path): - fl.update([os.path.realpath(os.path.join(search_path, x)) - for x in os.listdir(search_path) if 'libnvinfer.so' in x]) + fl.update([ + os.path.realpath(os.path.join(search_path, x)) + for x in os.listdir(search_path) + if 'libnvinfer.so' in x + ]) return fl possible_files = find_libs(trt_install_path) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib'))) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64'))) - - def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver): - """Check the compatibility between tensorrt and cudnn/cudart libraries.""" - ldd_bin = which('ldd') or '/usr/bin/ldd' - ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep) - cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') - cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') - cudnn = None - cudart = None - for line in ldd_out: - if 'libcudnn.so' in line: - cudnn = cudnn_pattern.search(line) - elif 'libcudart.so' in line: - cudart = cuda_pattern.search(line) - if cudnn and len(cudnn.group(1)): - cudnn = convert_version_to_int(cudnn.group(1)) - if cudart and len(cudart.group(1)): - cudart = convert_version_to_int(cudart.group(1)) - return (cudnn == cudnn_ver) and (cudart == cuda_ver) - cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION']) cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION']) nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$') highest_ver = [0, None, None] for lib_file in possible_files: - if is_compatible(lib_file, cuda_ver, cudnn_ver): + if is_cuda_compatible(lib_file, cuda_ver, cudnn_ver): matches = nvinfer_pattern.search(lib_file) if len(matches.groups()) == 0: continue @@ -1029,12 +1040,13 @@ def set_tf_tensorrt_install_path(environ_cp): # Try another alternative from ldconfig. 
ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' ldconfig_output = run_shell([ldconfig_bin, '-p']) - search_result = re.search( - '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output) + search_result = re.search('.*libnvinfer.so\\.?([0-9.]*).* => (.*)', + ldconfig_output) if search_result: libnvinfer_path_from_ldconfig = search_result.group(2) if os.path.exists(libnvinfer_path_from_ldconfig): - if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver): + if is_cuda_compatible(libnvinfer_path_from_ldconfig, cuda_ver, + cudnn_ver): trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig) tf_tensorrt_version = search_result.group(1) break diff --git a/tensorflow/BUILD b/tensorflow/BUILD index e4530a5962..233fe21fbf 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -154,6 +154,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux_s390x", + values = {"cpu": "s390x"}, + visibility = ["//visibility:public"], +) + config_setting( name = "debug", values = { @@ -459,6 +465,15 @@ filegroup( tf_cc_shared_object( name = "libtensorflow_framework.so", framework_so = [], + linkopts = select({ + "//tensorflow:darwin": [], + "//tensorflow:windows": [], + "//tensorflow:windows_msvc": [], + "//conditions:default": [ + "-Wl,--version-script", # This line must be directly followed by the version_script.lds file + "$(location //tensorflow:tf_framework_version_script.lds)", + ], + }), linkstatic = 1, visibility = ["//visibility:public"], deps = [ @@ -468,6 +483,7 @@ tf_cc_shared_object( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", "//tensorflow/core:lib_internal_impl", "//tensorflow/stream_executor:stream_executor_impl", + "//tensorflow:tf_framework_version_script.lds", ] + tf_additional_binary_deps(), ) @@ -571,3 +587,13 @@ py_library( visibility = ["//visibility:public"], deps = ["//tensorflow/python:no_contrib"], ) + +cc_library( + name = "grpc", + deps = ["@grpc"], +) + +cc_library( + name = "grpc++", + deps = ["@grpc//:grpc++"], +) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 37c8302e08..5c218d3f25 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2068,7 +2068,8 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status) { GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return nullptr; } @@ -2098,7 +2099,8 @@ void TF_GraphImportGraphDefWithReturnOutputs( return; } GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return; } diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 0025842aea..28070d60db 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -287,7 +287,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); const int64 result_index = compile_result.aot->result_buffer_index(); const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes(); - if (result_index < 0 || result_index > temp_sizes.size()) { + if (result_index < 0 || result_index >= 
temp_sizes.size()) { return errors::InvalidArgument("result index: ", result_index, " is outside the range of temp sizes: [0,", temp_sizes.size(), ")"); diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 1775666652..0b1cec1925 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -39,10 +39,10 @@ tf_cc_binary( srcs = ["grpc_service_main.cc"], deps = [ ":grpc_service", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", - "@grpc//:grpc++", ], ) @@ -54,6 +54,7 @@ tf_cc_test( ], deps = [ ":grpc_stub", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -61,7 +62,6 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@grpc//:grpc++", ], ) @@ -71,9 +71,9 @@ cc_library( hdrs = ["grpc_service.h"], deps = [ ":xla_service_proto", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core/distributed_runtime/rpc:grpc_util", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index ae0749edb9..fe99f700d2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2550,7 +2550,6 @@ cc_library( name = "hlo_tfgraph_builder", srcs = ["hlo_tfgraph_builder.cc"], hdrs = ["hlo_tfgraph_builder.h"], - visibility = ["//tensorflow/compiler/xla/tools:__pkg__"], deps = [ ":hlo", "//tensorflow/compiler/xla:literal_util", diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 088c97fbe3..5aaeec802f 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1515,6 +1515,7 @@ bool HloInstruction::IdenticalSlowPath( // Remaining instructions with special values. 
case HloOpcode::kCall: + return eq_computations(to_apply(), other.to_apply()); case HloOpcode::kConditional: return eq_computations(true_computation(), other.true_computation()) && eq_computations(false_computation(), other.false_computation()); diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index e1c5123774..d8ca99dfd1 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -924,6 +924,40 @@ TEST_F(HloInstructionTest, IdenticalInstructions) { *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2))); } +TEST_F(HloInstructionTest, IdenticalCallInstructions) { + const char* const hlo_string = R"( +HloModule Module + +subcomp1 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] sine(x) +} + +subcomp2 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] cosine(x) +} + +ENTRY entry (param: f32[]) -> (f32[], f32[], f32[]) { + p = f32[] parameter(0) + t1 = f32[] call(p), to_apply=subcomp1 + t2 = f32[] call(p), to_apply=subcomp1 + t3 = f32[] call(p), to_apply=subcomp2 + ROOT t = (f32[], f32[], f32[]) tuple(t1, t2, t3) + } +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + auto* root = module->entry_computation()->root_instruction(); + auto* t1 = root->operand(0); + auto* t2 = root->operand(1); + auto* t3 = root->operand(2); + + EXPECT_TRUE(StructuralEqual(*t1, *t2)); + EXPECT_FALSE(StructuralEqual(*t1, *t3)); +} + TEST_F(HloInstructionTest, FunctionVisitor) { // Verify the function visitor HloInstruction::Accept visits all instructions // from a root properly given the following graph: diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD index 931ff62064..b2e2e27673 100644 --- a/tensorflow/contrib/autograph/converters/BUILD +++ b/tensorflow/contrib/autograph/converters/BUILD @@ -120,7 +120,10 @@ py_test( name = "decorators_test", srcs = ["decorators_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_pip", + "no_windows", + ], deps = [ ":converters", "//tensorflow/contrib/autograph/core:test_lib", diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py index 671c9ccc13..988df70157 100644 --- a/tensorflow/contrib/autograph/operators/control_flow.py +++ b/tensorflow/contrib/autograph/operators/control_flow.py @@ -51,7 +51,7 @@ def for_stmt(iter_, extra_test, body, init_state): Args: iter_: The entity being iterated over. extra_test: Callable with the state as arguments, and boolean return type. - An additionnal loop condition. + An additional loop condition. body: Callable with the iterate and the state as arguments, and state as return type. The actual loop body. init_state: Tuple containing the initial state. diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py index 358d56ce20..4acc4ed66a 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py @@ -286,7 +286,7 @@ class Forward(object): # TODO(alexbw): see if we can simplify by visiting breadth-first def visit(self, node): - """Depth-first walking the CFG, applying dataflow information propagtion.""" + """Depth-first walking the CFG, applying dataflow info propagation.""" # node.value is None only for the exit CfgNode. 
if not node.value: return diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py index 3328dde7aa..7655811830 100644 --- a/tensorflow/contrib/autograph/pyct/transformer.py +++ b/tensorflow/contrib/autograph/pyct/transformer.py @@ -218,7 +218,7 @@ class Base(gast.NodeTransformer): # TODO(mdan): Once we have error tracing, we may be able to just go to SSA. def apply_to_single_assignments(self, targets, values, apply_fn): - """Applies a fuction to each individual assignment. + """Applies a function to each individual assignment. This function can process a possibly-unpacked (e.g. a, b = c, d) assignment. It tries to break down the unpacking if possible. In effect, it has the same @@ -246,7 +246,7 @@ class Base(gast.NodeTransformer): targets field of an ast.Assign node. values: an AST node. apply_fn: a function of a single argument, which will be called with the - respective nodes of each single assignment. The signaure is + respective nodes of each single assignment. The signature is apply_fn(target, value), no return value. """ if not isinstance(targets, (list, tuple)): diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index e524e9e743..4ca7a1b28c 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -336,40 +336,14 @@ endif() # MKL Support if (tensorflow_ENABLE_MKL_SUPPORT) add_definitions(-DINTEL_MKL -DEIGEN_USE_VML) - if (WIN32) - find_path(MKL_HOME_PLATFORM mkl - PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ - $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ - PATH_SUFFIXES windows) - set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) - set(MKL_LINK_DIRS - ${MKL_HOME_PLATFORM}/mkl/lib/intel64 - ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt - ${MKL_HOME_PLATFORM}/compiler/lib/intel64 - ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib) - set(MKL_REDIST_DLL_DIRS - ${MKL_HOME_PLATFORM}/redist/intel64/mkl - ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt - ${MKL_HOME_PLATFORM}/redist/intel64/compiler) - list(APPEND tensorflow_EXTERNAL_LIBRARIES - mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64) - endif() - if (UNIX) - # Fix me: complete the path on linux - find_path(MKL_HOME_PLATFORM mkl - HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ - $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ - PATH_SUFFIXES linux) - set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) - set(MKL_LINK_DIRS) # incompleted - set(MKL_REDIST_SO_DIRS) # incompleted - endif() - include_directories(${MKL_INCLUDE_DIRS}) - link_directories(${MKL_LINK_DIRS}) + include(mkl) + list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkl_STATIC_LIBRARIES}) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkl_copy_shared_to_destination) + include_directories(${mkl_INCLUDE_DIRS}) if (tensorflow_ENABLE_MKLDNN_SUPPORT) include(mkldnn) list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES}) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn_copy_shared_to_destination) include_directories(${mkldnn_INCLUDE_DIRS}) else (tensorflow_ENABLE_MKLDNN_SUPPORT) add_definitions(-DINTEL_MKL_ML) diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake index 527ccdc8d8..5c5adaf579 100644 --- a/tensorflow/contrib/cmake/external/double_conversion.cmake +++ b/tensorflow/contrib/cmake/external/double_conversion.cmake @@ -16,15 +16,15 
@@ include (ExternalProject) set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion) set(double_conversion_URL https://github.com/google/double-conversion.git) -set(double_conversion_TAG 5664746) +set(double_conversion_TAG 3992066a95b823efc8ccc1baf82a1cfc73f6e9b8) set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR}) set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so) set(double_conversion_INCLUDES ${double_conversion_BUILD}) if(WIN32) - set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/$(Configuration)/double-conversion.lib) else() - set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/libdouble-conversion.a) endif() set(double_conversion_HEADERS diff --git a/tensorflow/contrib/cmake/external/mkl.cmake b/tensorflow/contrib/cmake/external/mkl.cmake new file mode 100644 index 0000000000..a172e3a41a --- /dev/null +++ b/tensorflow/contrib/cmake/external/mkl.cmake @@ -0,0 +1,68 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +include (ExternalProject) + +# NOTE: Different from mkldnn.cmake, this file is meant to download mkl libraries +set(mkl_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include) +set(mkl_BIN_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/bin) +set(mkl_WIN mklml_win_2018.0.3.20180406.zip) # match for v0.14 +set(mkl_MAC mklml_mac_2018.0.3.20180406.tgz) +set(mkl_LNX mklml_lnx_2018.0.3.20180406.tgz) +set(mkl_TAG v0.14) +set(mkl_URL https://github.com/intel/mkl-dnn/releases) + +if (WIN32) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_WIN}) + list(APPEND mkl_STATIC_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.lib) + list(APPEND mkl_STATIC_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.lib) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.dll) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.dll) +elseif (UNIX) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_LNX}) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5.so) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_gnu.so) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_intel.so) +elseif (APPLE) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_MAC}) + #TODO need more information +endif () + +ExternalProject_Add(mkl + PREFIX mkl + URL ${mkl_DOWNLOAD_URL} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") + +# put mkl dynamic libraries in one bin directory +add_custom_target(mkl_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${mkl_BIN_DIRS} + DEPENDS mkl) + +add_custom_target(mkl_copy_shared_to_destination DEPENDS mkl_create_destination_dir) + +foreach(dll_file ${mkl_SHARED_LIBRARIES}) + add_custom_command(TARGET mkl_copy_shared_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dll_file} ${mkl_BIN_DIRS}) +endforeach() diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake index a639fdee36..8123ee1f39 100644 --- a/tensorflow/contrib/cmake/external/mkldnn.cmake +++ b/tensorflow/contrib/cmake/external/mkldnn.cmake @@ -22,8 +22,11 @@ set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291) if(WIN32) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib) + set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.dll) + set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release) else() set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib) + set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.dll) endif() else() set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a) @@ -31,6 +34,7 @@ endif() ExternalProject_Add(mkldnn PREFIX mkldnn + DEPENDS mkl GIT_REPOSITORY ${mkldnn_URL} GIT_TAG ${mkldnn_TAG} DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" @@ -40,5 +44,11 @@ ExternalProject_Add(mkldnn CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DMKLINC:STRING=${MKL_INCLUDE_DIRS} + -DMKLINC:STRING=${mkl_INCLUDE_DIRS} ) + +# since mkldnn depends on mkl, copy the mkldnn.dll 
together with mklml.dll to mkl_bin_dirs
+add_custom_target(mkldnn_copy_shared_to_destination DEPENDS mkldnn)
+
+add_custom_command(TARGET mkldnn_copy_shared_to_destination PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${mkldnn_SHARED_LIBRARIES} ${mkl_BIN_DIRS})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index df6702a42c..e3b59001bc 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -755,26 +755,65 @@ set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
 file(WRITE "${api_init_list_file}" "${api_init_files}")
 # Run create_python_api.py to generate __init__.py files.
-add_custom_command(
- OUTPUT ${api_init_files}
- DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
-
- # tensorflow/__init__.py depends on files generated in this step. So, remove it while
- # this step is running since the files aren't there yet.
- COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-
- # Run create_python_api.py to generate API init files.
- COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
- "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
- "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
- "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
- "--package=tensorflow.python"
- "--apiname=tensorflow"
- "${api_init_list_file}"
- COMMENT "Generating __init__.py files for Python API."
- WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
-)
+### TODO
+# In order to download and compile MKL/MKL-DNN automatically in the cmake script, the mkl-built libraries should be added to the
+# system path so that the python executor can load them. However `add_custom_command` has an issue with `COMMAND ${CMAKE_COMMAND} -E env PATH=`:
+# an argument containing multiple paths (such as D:/;D:/mkl) is split at the semicolon into separate strings, so the command fails to
+# recognize the paths. Since CUDA is not built with MKL, passing only the MKL build directory to this command works around the issue.
+# To avoid overriding the CUDA and system path in other circumstances, an `if-else` branch is used here; it should be
+# removed once the path issue is resolved.
+###
+
+if (tensorflow_ENABLE_MKL_SUPPORT)
+ # Add the mkl dist dlls to the system path for python.
+ # TODO: In the current cmake version, PY_RUNTIME_ENV behaves strangely with multiple paths,
+ # so we have to specify only one path in it to work around the issue. We need this if/else
+ # to avoid overwriting the CUDA environment.
+ set(PY_RUNTIME_ENV ${mkl_BIN_DIRS})
+ add_custom_command(
+ OUTPUT ${api_init_files}
+ DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+ # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+ # this step is running since the files aren't there yet.
+ COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+ # Run create_python_api.py to generate API init files.
+ COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python PATH=${PY_RUNTIME_ENV} ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "--package=tensorflow.python" + "--apiname=tensorflow" + "${api_init_list_file}" + + COMMENT "Generating __init__.py files for Python API." + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" + VERBATIM + ) +else (tensorflow_ENABLE_MKL_SUPPORT) + add_custom_command( + OUTPUT ${api_init_files} + DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops + + # tensorflow/__init__.py depends on files generated in this step. So, remove it while + # this step is running since the files aren't there yet. + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + + # Run create_python_api.py to generate API init files. + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "--package=tensorflow.python" + "--apiname=tensorflow" + "${api_init_list_file}" + + COMMENT "Generating __init__.py files for Python API." + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" + ) +endif (tensorflow_ENABLE_MKL_SUPPORT) add_custom_target(tf_python_api SOURCES ${api_init_files}) add_dependencies(tf_python_api tf_python_ops) diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake index 38f40452b5..fdf522f1fd 100644 --- a/tensorflow/contrib/cmake/tf_shared_lib.cmake +++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake @@ -145,3 +145,8 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/ # unsupported Eigen directory install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/ DESTINATION include/unsupported/Eigen) +# mkl +if (tensorflow_ENABLE_MKL_SUPPORT) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/ + DESTINATION include/mkl) +endif (tensorflow_ENABLE_MKL_SUPPORT) diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md index c65a150464..cb1dd7d836 100644 --- a/tensorflow/contrib/constrained_optimization/README.md +++ b/tensorflow/contrib/constrained_optimization/README.md @@ -46,7 +46,7 @@ document. Imagine that we want to constrain the recall of a binary classifier to be at least 90%. Since the recall is proportional to the number of true positive classifications, which itself is a sum of indicator functions, this constraint -is non-differentible, and therefore cannot be used in a problem that will be +is non-differentiable, and therefore cannot be used in a problem that will be optimized using a (stochastic) gradient-based algorithm. 
For this and similar problems, TFCO supports so-called *proxy constraints*, diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py index 04014ab4ae..3791dae8d7 100644 --- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py @@ -169,8 +169,8 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix): del old_inactive # Needed by the condition, but not the body. iteration += 1 scale = (1.0 - standard_ops.reduce_sum( - matrix, axis=0, keep_dims=True)) / standard_ops.maximum( - 1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True)) + matrix, axis=0, keepdims=True)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True)) matrix += scale * inactive new_inactive = standard_ops.to_float(matrix > 0) matrix *= new_inactive @@ -206,10 +206,10 @@ def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix): # For numerical reasons, make sure that the largest matrix element is zero # before exponentiating. - log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True) + log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keepdims=True) log_matrix -= standard_ops.log( standard_ops.reduce_sum( - standard_ops.exp(log_matrix), axis=0, keep_dims=True)) + standard_ops.exp(log_matrix), axis=0, keepdims=True)) return log_matrix diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py index 33c48e20be..5590a4bf78 100644 --- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py @@ -58,6 +58,7 @@ class SlideDatasetTest(test.TestCase): [t.shape.as_list() for t in get_next]) with self.test_session() as sess: + # stride < window_size. # Slide over a finite input, where the window_size divides the # total number of elements. sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 7}) @@ -71,11 +72,9 @@ class SlideDatasetTest(test.TestCase): result_component[j]) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) - # Slide over a finite input, where the window_size does not # divide the total number of elements. sess.run(init_op, feed_dict={count: 20, window_size: 17, stride: 9}) - num_batches = (20 * 7 - 17) // 9 + 1 for i in range(num_batches): result = sess.run(get_next) @@ -86,6 +85,41 @@ class SlideDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + # stride == window_size. + sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 14}) + num_batches = 20 * 7 // 14 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(14): + self.assertAllEqual(component[(i*14 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + # stride > window_size. 
+ sess.run(init_op, feed_dict={count: 20, window_size: 10, stride: 14}) + num_batches = 20 * 7 // 14 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(10): + self.assertAllEqual(component[(i*14 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + # Drop the last batch which is smaller than window_size. + sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 19}) + num_batches = (20 * 7 - 7) // 19 # = 19 * 7 // 19 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(14): + self.assertAllEqual(component[(i*19 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + # Slide over a finite input, which is less than window_size, # should fail straight away. sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 4}) @@ -108,10 +142,6 @@ class SlideDatasetTest(test.TestCase): # Invalid stride should be an initialization time error. with self.assertRaises(errors.InvalidArgumentError): sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 0}) - with self.assertRaises(errors.InvalidArgumentError): - sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 3}) - with self.assertRaises(errors.InvalidArgumentError): - sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 5}) def assertSparseValuesEqual(self, a, b): self.assertAllEqual(a.indices, b.indices) diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py index f935beb1a9..3f3c5ca17c 100644 --- a/tensorflow/contrib/data/python/ops/sliding.py +++ b/tensorflow/contrib/data/python/ops/sliding.py @@ -86,7 +86,7 @@ def sliding_window_batch(window_size, stride=1): elements in the sliding window. stride: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the steps moving the sliding window forward for one iteration. The default - is `1`. It must be in `[1, window_size)`. + is `1`. It must be positive. Returns: A `Dataset` transformation function, which can be passed to diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb new file mode 100644 index 0000000000..54ebcad8e9 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -0,0 +1,909 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "nmt_with_attention.ipynb", + "version": "0.3.2", + "views": {}, + "default_view": {}, + "provenance": [ + { + "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U", + "timestamp": 1527858391290 + }, + { + "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv", + "timestamp": 1527776041613 + } + ], + "private_outputs": true, + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "AOpGoE2T-YXS", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "##### Copyright 2018 The TensorFlow Authors.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\").\n", + "\n", + "# Neural Machine Translation with Attention\n", + "\n", + "
\n", + "\n", + " Run in Google Colab \n", + "\n", + "View source on Github
" + ] + }, + { + "metadata": { + "id": "CiwtNgENbx2g", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n", + "\n", + "After training the model in this notebook, you will be able to input a Spanish sentence, such as *\"¿todavia estan en casa?\"*, and return the English translation: *\"are you still at home?\"*\n", + "\n", + "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n", + "\n", + "\"spanish-english\n", + "\n", + "Note: This example takes approximately 10 mintues to run on a single P100 GPU." + ] + }, + { + "metadata": { + "id": "tnxXKDjq3jEL", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "from __future__ import absolute_import, division, print_function\n", + "\n", + "# Import TensorFlow >= 1.9 and enable eager execution\n", + "import tensorflow as tf\n", + "\n", + "tf.enable_eager_execution()\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import unicodedata\n", + "import re\n", + "import numpy as np\n", + "import os\n", + "import time\n", + "\n", + "print(tf.__version__)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wfodePkj3jEa", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Download and prepare the dataset\n", + "\n", + "We'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:\n", + "\n", + "```\n", + "May I borrow this book?\t¿Puedo tomar prestado este libro?\n", + "```\n", + "\n", + "There are a variety of languages available, but we'll use the English-Spanish dataset. For convenience, we've hosted a copy of this dataset on Google Cloud, but you can also download your own copy. After downloading the dataset, here are the steps we'll take to prepare the data:\n", + "\n", + "1. Add a *start* and *end* token to each sentence.\n", + "2. Clean the sentences by removing special characters.\n", + "3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).\n", + "4. Pad each sentence to a maximum length." 
+ ] + }, + { + "metadata": { + "id": "kRVATYOgJs1b", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Download the file\n", + "path_to_zip = tf.keras.utils.get_file(\n", + " 'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', \n", + " extract=True)\n", + "\n", + "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "rd0jw-eC3jEh", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Converts the unicode file to ascii\n", + "def unicode_to_ascii(s):\n", + " return ''.join(c for c in unicodedata.normalize('NFD', s)\n", + " if unicodedata.category(c) != 'Mn')\n", + "\n", + "\n", + "def preprocess_sentence(w):\n", + " w = unicode_to_ascii(w.lower().strip())\n", + " \n", + " # creating a space between a word and the punctuation following it\n", + " # eg: \"he is a boy.\" => \"he is a boy .\" \n", + " # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n", + " w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n", + " w = re.sub(r'[\" \"]+', \" \", w)\n", + " \n", + " # replacing everything with space except (a-z, A-Z, \".\", \"?\", \"!\", \",\")\n", + " w = re.sub(r\"[^a-zA-Z?.!,¿]+\", \" \", w)\n", + " \n", + " w = w.rstrip().strip()\n", + " \n", + " # adding a start and an end token to the sentence\n", + " # so that the model know when to start and stop predicting.\n", + " w = ' ' + w + ' '\n", + " return w" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OHn4Dct23jEm", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# 1. Remove the accents\n", + "# 2. Clean the sentences\n", + "# 3. Return word pairs in the format: [ENGLISH, SPANISH]\n", + "def create_dataset(path, num_examples):\n", + " lines = open(path, encoding='UTF-8').read().strip().split('\\n')\n", + " \n", + " word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')] for l in lines[:num_examples]]\n", + " \n", + " return word_pairs" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "9xbqO7Iie9bb", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# This class creates a word -> index mapping (e.g,. 
\"dad\" -> 5) and vice-versa \n", + "# (e.g., 5 -> \"dad\") for each language,\n", + "class LanguageIndex():\n", + " def __init__(self, lang):\n", + " self.lang = lang\n", + " self.word2idx = {}\n", + " self.idx2word = {}\n", + " self.vocab = set()\n", + " \n", + " self.create_index()\n", + " \n", + " def create_index(self):\n", + " for phrase in self.lang:\n", + " self.vocab.update(phrase.split(' '))\n", + " \n", + " self.vocab = sorted(self.vocab)\n", + " \n", + " self.word2idx[''] = 0\n", + " for index, word in enumerate(self.vocab):\n", + " self.word2idx[word] = index + 1\n", + " \n", + " for word, index in self.word2idx.items():\n", + " self.idx2word[index] = word" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "eAY9k49G3jE_", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def max_length(tensor):\n", + " return max(len(t) for t in tensor)\n", + "\n", + "\n", + "def load_dataset(path, num_examples):\n", + " # creating cleaned input, output pairs\n", + " pairs = create_dataset(path, num_examples)\n", + "\n", + " # index language using the class defined above \n", + " inp_lang = LanguageIndex(sp for en, sp in pairs)\n", + " targ_lang = LanguageIndex(en for en, sp in pairs)\n", + " \n", + " # Vectorize the input and target languages\n", + " \n", + " # Spanish sentences\n", + " input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]\n", + " \n", + " # English sentences\n", + " target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]\n", + " \n", + " # Calculate max_length of input and output tensor\n", + " # Here, we'll set those to the longest sentence in the dataset\n", + " max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)\n", + " \n", + " # Padding the input and output tensor to the maximum length\n", + " input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, \n", + " maxlen=max_length_inp,\n", + " padding='post')\n", + " \n", + " target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, \n", + " maxlen=max_length_tar, \n", + " padding='post')\n", + " \n", + " return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "GOi42V79Ydlr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Limit the size of the dataset to experiment faster (optional)\n", + "\n", + "Training on the complete dataset of >100,000 sentences will take a long time. 
To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):" + ] + }, + { + "metadata": { + "id": "cnxC7q-j3jFD", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Try experimenting with the size of that dataset\n", + "num_examples = 30000\n", + "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "4QILQkOs3jFG", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Creating training and validation sets using an 80-20 split\n", + "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n", + "\n", + "# Show length\n", + "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "rgCLkfv5uO3d", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Create a tf.data dataset" + ] + }, + { + "metadata": { + "id": "TqHsArVZ3jFS", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "BUFFER_SIZE = len(input_tensor_train)\n", + "BATCH_SIZE = 64\n", + "embedding_dim = 256\n", + "units = 1024\n", + "vocab_inp_size = len(inp_lang.word2idx)\n", + "vocab_tar_size = len(targ_lang.word2idx)\n", + "\n", + "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n", + "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "TNfHIF71ulLu", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Write the encoder and decoder model\n", + "\n", + "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n", + "\n", + "\"attention\n", + "\n", + "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n", + "\n", + "Here are the equations that are implemented:\n", + "\n", + "\"attention\n", + "\"attention\n", + "\n", + "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n", + "\n", + "* FC = Fully connected (dense) layer\n", + "* EO = Encoder output\n", + "* H = hidden state\n", + "* X = input to the decoder\n", + "\n", + "And the pseudo-code:\n", + "\n", + "* `score = FC(tanh(FC(EO) + FC(H)))`\n", + "* `attention weights = softmax(score, axis = 1)`. 
Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.\n", + "* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.\n", + "* `embedding output` = The input to the decoder X is passed through an embedding layer.\n", + "* `merged vector = concat(embedding output, context vector)`\n", + "* This merged vector is then given to the GRU\n", + " \n", + "The shapes of all the vectors at each step have been specified in the comments in the code:" + ] + }, + { + "metadata": { + "id": "avyJ_4VIUoHb", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def gru(units):\n", + " # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n", + " # the code automatically does that.\n", + " if tf.test.is_gpu_available():\n", + " return tf.keras.layers.CuDNNGRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_initializer='glorot_uniform')\n", + " else:\n", + " return tf.keras.layers.GRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_activation='sigmoid', \n", + " recurrent_initializer='glorot_uniform')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "nZ2rI24i3jFg", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "class Encoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n", + " super(Encoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.enc_units = enc_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = gru(self.enc_units)\n", + " \n", + " def call(self, x, hidden):\n", + " x = self.embedding(x)\n", + " output, state = self.gru(x, initial_state = hidden) \n", + " return output, state\n", + " \n", + " def initialize_hidden_state(self):\n", + " return tf.zeros((self.batch_sz, self.enc_units))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "yJ_B3mhW3jFk", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "class Decoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n", + " super(Decoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.dec_units = dec_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = gru(self.dec_units)\n", + " self.fc = tf.keras.layers.Dense(vocab_size)\n", + " \n", + " # used for attention\n", + " self.W1 = tf.keras.layers.Dense(self.dec_units)\n", + " self.W2 = tf.keras.layers.Dense(self.dec_units)\n", + " self.V = tf.keras.layers.Dense(1)\n", + " \n", + " def call(self, x, hidden, enc_output):\n", + " # enc_output shape == (batch_size, max_length, hidden_size)\n", + " \n", + " # hidden shape == (batch_size, hidden size)\n", + " # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n", + " # we are doing this to perform addition to calculate the score\n", + " hidden_with_time_axis = tf.expand_dims(hidden, 1)\n", + " \n", + " # 
score shape == (batch_size, max_length, hidden_size)\n", + " score = tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))\n", + " \n", + " # attention_weights shape == (batch_size, max_length, 1)\n", + " # we get 1 at the last axis because we are applying score to self.V\n", + " attention_weights = tf.nn.softmax(self.V(score), axis=1)\n", + " \n", + " # context_vector shape after sum == (batch_size, hidden_size)\n", + " context_vector = attention_weights * enc_output\n", + " context_vector = tf.reduce_sum(context_vector, axis=1)\n", + " \n", + " # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n", + " x = self.embedding(x)\n", + " \n", + " # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n", + " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n", + " \n", + " # passing the concatenated vector to the GRU\n", + " output, state = self.gru(x)\n", + " \n", + " # output shape == (batch_size * max_length, hidden_size)\n", + " output = tf.reshape(output, (-1, output.shape[2]))\n", + " \n", + " # output shape == (batch_size * max_length, vocab)\n", + " x = self.fc(output)\n", + " \n", + " return x, state, attention_weights\n", + " \n", + " def initialize_hidden_state(self):\n", + " return tf.zeros((self.batch_sz, self.dec_units))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "P5UY8wko3jFp", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n", + "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "_ch_71VbIRfK", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Define the optimizer and the loss function" + ] + }, + { + "metadata": { + "id": "WmTHr5iV3jFr", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "optimizer = tf.train.AdamOptimizer()\n", + "\n", + "\n", + "def loss_function(real, pred):\n", + " mask = 1 - np.equal(real, 0)\n", + " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n", + " return tf.reduce_mean(loss_)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "hpObfY22IddU", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Training\n", + "\n", + "1. Pass the *input* through the *encoder* which return *encoder output* and the *encoder hidden state*.\n", + "2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.\n", + "3. The decoder returns the *predictions* and the *decoder hidden state*.\n", + "4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n", + "5. Use *teacher forcing* to decide the next input to the decoder.\n", + "6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.\n", + "7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate." 
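Steps 5 and 6 (teacher forcing) are easy to miss inside the full training loop below, so here is a toy, framework-free sketch of just that part. `fake_decoder` and the integer "tokens" are stand-ins, not the notebook's model.

```python
# Illustrative teacher forcing: at every step the *ground-truth* token is fed
# to the decoder, regardless of what the decoder just predicted.
def fake_decoder(token, hidden):
    # stand-in decoder: "predicts" token + 1 and leaves the state unchanged
    return token + 1, hidden

targ = [2, 7, 9, 4]                     # ground-truth target ids for one sentence
dec_input, hidden, loss = targ[0], 0, 0
for t in range(1, len(targ)):
    prediction, hidden = fake_decoder(dec_input, hidden)
    loss += abs(prediction - targ[t])   # stand-in for the real per-step loss
    dec_input = targ[t]                 # teacher forcing: feed the true token
print(loss)
```

Without teacher forcing, `dec_input` would instead be set to `prediction`, which is exactly what the evaluation loop further down does.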
+ ] + }, + { + "metadata": { + "id": "ddefjBMa3jF0", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "EPOCHS = 10\n", + "\n", + "for epoch in range(EPOCHS):\n", + " start = time.time()\n", + " \n", + " hidden = encoder.initialize_hidden_state()\n", + " total_loss = 0\n", + " \n", + " for (batch, (inp, targ)) in enumerate(dataset):\n", + " loss = 0\n", + " \n", + " with tf.GradientTape() as tape:\n", + " enc_output, enc_hidden = encoder(inp, hidden)\n", + " \n", + " dec_hidden = enc_hidden\n", + " \n", + " dec_input = tf.expand_dims([targ_lang.word2idx['']] * BATCH_SIZE, 1) \n", + " \n", + " # Teacher forcing - feeding the target as the next input\n", + " for t in range(1, targ.shape[1]):\n", + " # passing enc_output to the decoder\n", + " predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)\n", + " \n", + " loss += loss_function(targ[:, t], predictions)\n", + " \n", + " # using teacher forcing\n", + " dec_input = tf.expand_dims(targ[:, t], 1)\n", + " \n", + " total_loss += (loss / int(targ.shape[1]))\n", + " \n", + " variables = encoder.variables + decoder.variables\n", + " \n", + " gradients = tape.gradient(loss, variables)\n", + " \n", + " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n", + "\n", + " if batch % 100 == 0:\n", + " print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n", + " batch,\n", + " loss.numpy() / int(targ.shape[1])))\n", + " \n", + " print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n", + " total_loss/len(input_tensor)))\n", + " print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "mU3Ce8M6I3rz", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Translate\n", + "\n", + "* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n", + "* Stop predicting when the model predicts the *end token*.\n", + "* And store the *attention weights for every time step*.\n", + "\n", + "Note: The encoder output is calculated only once for one input." 
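Before the real `evaluate` function below, a toy sketch of the inference loop just described: the model's own prediction is fed back in, and decoding stops at the end token. The id values and `fake_decoder` are stand-ins for illustration only.

```python
# Illustrative inference loop: feed back the previous prediction and stop
# when the (pretend) end-token id is produced.
END_ID = 3

def fake_decoder(token, hidden, enc_out):
    # stand-in decoder: cycles through ids 0..3, so END_ID always shows up
    return (token + 1) % 4, hidden

def greedy_decode(start_id, enc_out, max_steps=10):
    hidden, token, result = 0, start_id, []
    for _ in range(max_steps):          # cap the length, like max_length_targ
        token, hidden = fake_decoder(token, hidden, enc_out)
        if token == END_ID:             # stop predicting at the end token
            break
        result.append(token)            # keep the predicted id
    return result

print(greedy_decode(start_id=0, enc_out=None))   # -> [1, 2]
```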
+ ] + }, + { + "metadata": { + "id": "EbQpyYs13jF_", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", + " attention_plot = np.zeros((max_length_targ, max_length_inp))\n", + " \n", + " sentence = preprocess_sentence(sentence)\n", + "\n", + " inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]\n", + " inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')\n", + " inputs = tf.convert_to_tensor(inputs)\n", + " \n", + " result = ''\n", + "\n", + " hidden = [tf.zeros((1, units))]\n", + " enc_out, enc_hidden = encoder(inputs, hidden)\n", + "\n", + " dec_hidden = enc_hidden\n", + " dec_input = tf.expand_dims([targ_lang.word2idx['']], 0)\n", + "\n", + " for t in range(max_length_targ):\n", + " predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n", + " \n", + " # storing the attention weigths to plot later on\n", + " attention_weights = tf.reshape(attention_weights, (-1, ))\n", + " attention_plot[t] = attention_weights.numpy()\n", + "\n", + " predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n", + "\n", + " result += targ_lang.idx2word[predicted_id] + ' '\n", + "\n", + " if targ_lang.idx2word[predicted_id] == '':\n", + " return result, sentence, attention_plot\n", + " \n", + " # the predicted ID is fed back into the model\n", + " dec_input = tf.expand_dims([predicted_id], 0)\n", + "\n", + " return result, sentence, attention_plot" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "s5hQWlbN3jGF", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# function for plotting the attention weights\n", + "def plot_attention(attention, sentence, predicted_sentence):\n", + " fig = plt.figure(figsize=(10,10))\n", + " ax = fig.add_subplot(1, 1, 1)\n", + " ax.matshow(attention, cmap='viridis')\n", + " \n", + " fontdict = {'fontsize': 14}\n", + " \n", + " ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)\n", + " ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n", + "\n", + " plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "sl9zUHzg3jGI", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", + " result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n", + " \n", + " print('Input: {}'.format(sentence))\n", + " print('Predicted translation: {}'.format(result))\n", + " \n", + " attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n", + " plot_attention(attention_plot, sentence.split(' '), result.split(' '))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "WrAM0FDomq3E", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + 
"metadata": { + "id": "zSx2iM36EZQZ", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "A3LLCx3ZE0Ls", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "DUQVLVqUE1YW", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# wrong translation\n", + "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "RTe5P5ioMJwN", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Next steps\n", + "\n", + "* [Download a different dataset](http://www.manythings.org/anki/) to experiment with translations, for example, English to German, or English to French.\n", + "* Experiment with training on a larger dataset, or using more epochs\n" + ] + } + ] +} \ No newline at end of file diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py index ff903a78cc..5b5557bd8f 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py @@ -24,6 +24,7 @@ from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples from tensorflow.contrib.gan.python import train as tfgan_train from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator.canned import head +from tensorflow.python.estimator.export import export_output from tensorflow.python.framework import ops from tensorflow.python.ops import metrics as metrics_lib @@ -182,7 +183,10 @@ class GANHead(head._Head): # pylint: disable=protected-access if mode == model_fn_lib.ModeKeys.PREDICT: return model_fn_lib.EstimatorSpec( mode=model_fn_lib.ModeKeys.PREDICT, - predictions=gan_model.generated_data) + predictions=gan_model.generated_data, + export_outputs={ + 'predict': export_output.PredictOutput(gan_model.generated_data) + }) elif mode == model_fn_lib.ModeKeys.EVAL: gan_loss = self.create_loss( features=None, mode=mode, logits=gan_model, labels=None) diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py index 6587f1fc60..5309d87765 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py @@ -26,8 +26,11 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test +from tensorflow.python.saved_model import signature_constants from tensorflow.python.training import training +_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + def dummy_loss(gan_model, add_summaries=True): # pylint:disable=unused-argument return math_ops.reduce_sum(gan_model.discriminator_real_outputs - @@ 
-71,13 +74,15 @@ class GANHeadTest(test.TestCase): return {} def _test_modes_helper(self, mode): - self.gan_head.create_estimator_spec( + return self.gan_head.create_estimator_spec( features=None, mode=mode, logits=get_gan_model()) def test_modes_predict(self): - self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT) + spec = self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT) + self.assertItemsEqual((_DEFAULT_SERVING_KEY, 'predict'), + spec.export_outputs.keys()) def test_modes_eval(self): self._test_modes_helper(model_fn_lib.ModeKeys.EVAL) diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc index 1f9dd0decb..9025c992a4 100644 --- a/tensorflow/contrib/gdr/gdr_server_lib.cc +++ b/tensorflow/contrib/gdr/gdr_server_lib.cc @@ -57,7 +57,7 @@ Status GdrServer::Init() { new GdrWorker(env, remote_memory_manager_.get())); }; TF_RETURN_IF_ERROR( - GrpcServer::Init(nullptr, rendezvous_mgr_func, worker_func)); + GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func)); return remote_memory_manager_->Init(); } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index a7b0d805a3..4cfaa0f36d 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -26,7 +26,7 @@ namespace optimized_ops { // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on // Jetson TX-2. This compiler does not support the offsetof() macro. #if defined(__aarch64__) && !defined(GOOGLE_L4T) - +#include // clang-format gets confused with this file and ends up formatting lines to // be larger than 80 characters. Turn off here and back on at the end of the // file. diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index cbeb53bee7..681448be20 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -19,7 +19,9 @@ limitations under the License. #include #include +// Place `` before to avoid build failures in macOS. #include +#include // We forward declare TFLite classes here to avoid exposing them to SWIG. 
namespace tflite { diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 4f35de4e5d..bbdf962d04 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -29,6 +29,7 @@ py_library( "python/training/reg_adagrad_optimizer.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", + "python/training/weight_decay_optimizers.py", ], srcs_version = "PY2AND3", deps = [ @@ -198,6 +199,25 @@ py_test( ], ) +py_test( + name = "weight_decay_optimizers_test", + srcs = ["python/training/weight_decay_optimizers_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:session", + "//tensorflow/python:variables", + "//third_party/py/numpy", + ], +) + tf_py_test( name = "drop_stale_gradient_optimizer_test", srcs = ["python/training/drop_stale_gradient_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py index b41148329d..65777b1323 100644 --- a/tensorflow/contrib/opt/__init__.py +++ b/tensorflow/contrib/opt/__init__.py @@ -22,16 +22,17 @@ from __future__ import print_function from tensorflow.contrib.opt.python.training.adamax import * from tensorflow.contrib.opt.python.training.addsign import * from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import * +from tensorflow.contrib.opt.python.training.elastic_average_optimizer import * from tensorflow.contrib.opt.python.training.external_optimizer import * +from tensorflow.contrib.opt.python.training.ggt import * from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import * +from tensorflow.contrib.opt.python.training.model_average_optimizer import * from tensorflow.contrib.opt.python.training.moving_average_optimizer import * from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import * from tensorflow.contrib.opt.python.training.nadam_optimizer import * from tensorflow.contrib.opt.python.training.powersign import * from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import * -from tensorflow.contrib.opt.python.training.elastic_average_optimizer import * -from tensorflow.contrib.opt.python.training.model_average_optimizer import * -from tensorflow.contrib.opt.python.training.ggt import * +from tensorflow.contrib.opt.python.training.weight_decay_optimizers import * # pylint: enable=wildcard-import from tensorflow.python.util.all_util import remove_undocumented @@ -47,6 +48,10 @@ _allowed_symbols = [ 'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer', + 'MomentumWOptimizer', + 'AdamWOptimizer', + 'DecoupledWeightDecayExtension', + 'extend_with_decoupled_weight_decay', 'ScipyOptimizerInterface', 'VariableClippingOptimizer', 'MultitaskOptimizerWrapper', diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py new file mode 100644 index 0000000000..b9cf40eb7b --- /dev/null +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py @@ -0,0 +1,362 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base class to make optimizers weight decay ready.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import adam +from tensorflow.python.training import momentum as momentum_opt +from tensorflow.python.training import optimizer +from tensorflow.python.util.tf_export import tf_export + + +class DecoupledWeightDecayExtension(object): + """This class allows to extend optimizers with decoupled weight decay. + + It implements the decoupled weight decay described by Loshchilov & Hutter + (https://arxiv.org/pdf/1711.05101.pdf), in which the weight decay is + decoupled from the optimization steps w.r.t. to the loss function. + For SGD variants, this simplifies hyperparameter search since it decouples + the settings of weight decay and learning rate. + For adaptive gradient algorithms, it regularizes variables with large + gradients more than L2 regularization would, which was shown to yield better + training loss and generalization error in the paper above. + + This class alone is not an optimizer but rather extends existing + optimizers with decoupled weight decay. We explicitly define the two examples + used in the above paper (SGDW and AdamW), but in general this can extend + any OptimizerX by using + `extend_with_weight_decay(OptimizerX, weight_decay=weight_decay)`. + In order for it to work, it must be the first class the Optimizer with + weight decay inherits from, e.g. + + ```python + class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer): + def __init__(self, weight_decay, *args, **kwargs): + super(AdamWOptimizer, self).__init__(weight_decay, *args, **kwargs). + ``` + + Note that this extension decays weights BEFORE applying the update based + on the gradient, i.e. this extension only has the desired behaviour for + optimizers which do not depend on the value of'var' in the update step! + """ + + def __init__(self, weight_decay, **kwargs): + """Construct the extension class that adds weight decay to an optimizer. + + Args: + weight_decay: A `Tensor` or a floating point value, the factor by which + a variable is decayed in the update step. + **kwargs: Optional list or tuple or set of `Variable` objects to + decay. + """ + self._decay_var_list = None # is set in minimize or apply_gradients + self._weight_decay = weight_decay + # The tensors are initialized in call to _prepare + self._weight_decay_tensor = None + super(DecoupledWeightDecayExtension, self).__init__(**kwargs) + + def minimize(self, loss, global_step=None, var_list=None, + gate_gradients=optimizer.Optimizer.GATE_OP, + aggregation_method=None, colocate_gradients_with_ops=False, + name=None, grad_loss=None, decay_var_list=None): + """Add operations to minimize `loss` by updating `var_list` with decay. 
+ + This function is the same as Optimizer.minimize except that it allows to + specify the variables that should be decayed using decay_var_list. + If decay_var_list is None, all variables in var_list are decayed. + + For more information see the documentation of Optimizer.minimize. + + Args: + loss: A `Tensor` containing the value to minimize. + global_step: Optional `Variable` to increment by one after the + variables have been updated. + var_list: Optional list or tuple of `Variable` objects to update to + minimize `loss`. Defaults to the list of variables collected in + the graph under the key `GraphKeys.TRAINABLE_VARIABLES`. + gate_gradients: How to gate the computation of gradients. Can be + `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. + aggregation_method: Specifies the method used to combine gradient terms. + Valid values are defined in the class `AggregationMethod`. + colocate_gradients_with_ops: If True, try colocating gradients with + the corresponding op. + name: Optional name for the returned operation. + grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. + decay_var_list: Optional list of decay variables. + + Returns: + An Operation that updates the variables in `var_list`. If `global_step` + was not `None`, that operation also increments `global_step`. + + """ + self._decay_var_list = set(decay_var_list) if decay_var_list else False + return super(DecoupledWeightDecayExtension, self).minimize( + loss, global_step=global_step, var_list=var_list, + gate_gradients=gate_gradients, aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, name=name, + grad_loss=grad_loss) + + def apply_gradients(self, grads_and_vars, global_step=None, name=None, + decay_var_list=None): + """Apply gradients to variables and decay the variables. + + This function is the same as Optimizer.apply_gradients except that it + allows to specify the variables that should be decayed using + decay_var_list. If decay_var_list is None, all variables in var_list + are decayed. + + For more information see the documentation of Optimizer.apply_gradients. + + Args: + grads_and_vars: List of (gradient, variable) pairs as returned by + `compute_gradients()`. + global_step: Optional `Variable` to increment by one after the + variables have been updated. + name: Optional name for the returned operation. Default to the + name passed to the `Optimizer` constructor. + decay_var_list: Optional list of decay variables. + + Returns: + An `Operation` that applies the specified gradients. If `global_step` + was not None, that operation also increments `global_step`. + """ + self._decay_var_list = set(decay_var_list) if decay_var_list else False + return super(DecoupledWeightDecayExtension, self).apply_gradients( + grads_and_vars, global_step=global_step, name=name) + + def _prepare(self): + weight_decay = self._weight_decay + if callable(weight_decay): + weight_decay = weight_decay() + self._weight_decay_tensor = ops.convert_to_tensor( + weight_decay, name="weight_decay") + # Call the optimizers _prepare function. 
+ super(DecoupledWeightDecayExtension, self)._prepare() + + def _decay_weights_op(self, var): + if not self._decay_var_list or var in self._decay_var_list: + return var.assign_sub(self._weight_decay * var, self._use_locking) + return control_flow_ops.no_op() + + def _decay_weights_sparse_op(self, var, indices, scatter_add): + if not self._decay_var_list or var in self._decay_var_list: + return scatter_add(var, indices, -self._weight_decay * var, + self._use_locking) + return control_flow_ops.no_op() + + # Here, we overwrite the apply functions that the base optimizer calls. + # super().apply_x resolves to the apply_x function of the BaseOptimizer. + def _apply_dense(self, grad, var): + with ops.control_dependencies([self._decay_weights_op(var)]): + return super(DecoupledWeightDecayExtension, self)._apply_dense(grad, var) + + def _resource_apply_dense(self, grad, var): + with ops.control_dependencies([self._decay_weights_op(var)]): + return super(DecoupledWeightDecayExtension, self)._resource_apply_dense( + grad, var) + + def _apply_sparse(self, grad, var): + scatter_add = state_ops.scatter_add + decay_op = self._decay_weights_sparse_op(var, grad.indices, scatter_add) + with ops.control_dependencies([decay_op]): + return super(DecoupledWeightDecayExtension, self)._apply_sparse( + grad, var) + + def _resource_scatter_add(self, x, i, v, _=None): + # last argument allows for one overflow argument, to have the same function + # signature as state_ops.scatter_add + with ops.control_dependencies( + [resource_variable_ops.resource_scatter_add(x.handle, i, v)]): + return x.value() + + def _resource_apply_sparse(self, grad, var, indices): + scatter_add = self._resource_scatter_add + decay_op = self._decay_weights_sparse_op(var, indices, scatter_add) + with ops.control_dependencies([decay_op]): + return super(DecoupledWeightDecayExtension, self)._resource_apply_sparse( + grad, var, indices) + + +def extend_with_decoupled_weight_decay(base_optimizer): + """Factory function returning an optimizer class with decoupled weight decay. + + Returns an optimizer class. An instance of the returned class computes the + update step of `base_optimizer` and additionally decays the weights. + E.g., the class returned by + `extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)` is equivalent to + `tf.contrib.opt.AdamWOptimizer`. + + The API of the new optimizer class slightly differs from the API of the + base optimizer: + - The first argument to the constructor is the weight decay rate. + - `minimize` and `apply_gradients` accept the optional keyword argument + `decay_var_list`, which specifies the variables that should be decayed. + If `None`, all variables that are optimized are decayed. + + Usage example: + ```python + # MyAdamW is a new class + MyAdamW = extend_with_decoupled_weight_decay(tf.train.AdamOptimizer) + # Create a MyAdamW object + optimizer = MyAdamW(weight_decay=0.001, learning_rate=0.001) + sess.run(optimizer.minimize(loss, decay_variables=[var1, var2])) + + Note that this extension decays weights BEFORE applying the update based + on the gradient, i.e. this extension only has the desired behaviour for + optimizers which do not depend on the value of'var' in the update step! + ``` + + Args: + base_optimizer: An optimizer class that inherits from tf.train.Optimizer. + + Returns: + A new optimizer class that inherits from DecoupledWeightDecayExtension + and base_optimizer. 
+ """ + + class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecayExtension, + base_optimizer): + """Base_optimizer with decoupled weight decay. + + This class computes the update step of `base_optimizer` and + additionally decays the variable with the weight decay being decoupled from + the optimization steps w.r.t. to the loss function, as described by + Loshchilov & Hutter (https://arxiv.org/pdf/1711.05101.pdf). + For SGD variants, this simplifies hyperparameter search since + it decouples the settings of weight decay and learning rate. + For adaptive gradient algorithms, it regularizes variables with large + gradients more than L2 regularization would, which was shown to yield + better training loss and generalization error in the paper above. + """ + + def __init__(self, weight_decay, *args, **kwargs): + # super delegation is necessary here + # pylint: disable=useless-super-delegation + super(OptimizerWithDecoupledWeightDecay, self).__init__( + weight_decay, *args, **kwargs) + # pylint: enable=useless-super-delegation + + return OptimizerWithDecoupledWeightDecay + + +@tf_export("contrib.opt.MomentumWOptimizer") +class MomentumWOptimizer(DecoupledWeightDecayExtension, + momentum_opt.MomentumOptimizer): + """Optimizer that implements the Momentum algorithm with weight_decay. + + This is an implementation of the SGDW optimizer described in "Fixing + Weight Decay Regularization in Adam" by Loshchilov & Hutter + (https://arxiv.org/abs/1711.05101) + ([pdf])(https://arxiv.org/pdf/1711.05101.pdf). + It computes the update step of `train.MomentumOptimizer` and additionally + decays the variable. Note that this is different from adding + L2 regularization on the variables to the loss. Decoupling the weight decay + from other hyperparameters (in particular the learning rate) simplifies + hyperparameter search. + + For further information see the documentation of the Momentum Optimizer. + + Note that this optimizer can also be instantiated as + ```python + extend_with_weight_decay(tf.train.MomentumOptimizer, + weight_decay=weight_decay) + ``` + """ + + def __init__(self, weight_decay, learning_rate, momentum, + use_locking=False, name="MomentumW", use_nesterov=False): + """Construct a new MomentumW optimizer. + + For further information see the documentation of the Momentum Optimizer. + + Args: + weight_decay: A `Tensor` or a floating point value. The weight decay. + learning_rate: A `Tensor` or a floating point value. The learning rate. + momentum: A `Tensor` or a floating point value. The momentum. + use_locking: If `True` use locks for update operations. + name: Optional name prefix for the operations created when applying + gradients. Defaults to "Momentum". + use_nesterov: If `True` use Nesterov Momentum. + See [Sutskever et al., 2013]( + http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). + This implementation always computes gradients at the value of the + variable(s) passed to the optimizer. Using Nesterov Momentum makes the + variable(s) track the values called `theta_t + mu*v_t` in the paper. + + @compatibility(eager) + When eager execution is enabled, learning_rate, weight_decay and momentum + can each be a callable that takes no arguments and returns the actual value + to use. This can be useful for changing these values across different + invocations of optimizer functions. 
+@tf_export("contrib.opt.MomentumWOptimizer")
+class MomentumWOptimizer(DecoupledWeightDecayExtension,
+                         momentum_opt.MomentumOptimizer):
+  """Optimizer that implements the Momentum algorithm with weight decay.
+
+  This is an implementation of the SGDW optimizer described in "Fixing
+  Weight Decay Regularization in Adam" by Loshchilov & Hutter
+  (https://arxiv.org/abs/1711.05101)
+  ([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+  It computes the update step of `train.MomentumOptimizer` and additionally
+  decays the variable. Note that this is different from adding
+  L2 regularization on the variables to the loss. Decoupling the weight decay
+  from other hyperparameters (in particular the learning rate) simplifies
+  hyperparameter search.
+
+  For further information see the documentation of the Momentum Optimizer.
+
+  Note that this optimizer can also be instantiated as
+  ```python
+  extend_with_decoupled_weight_decay(tf.train.MomentumOptimizer)(
+      weight_decay=weight_decay, learning_rate=learning_rate, momentum=momentum)
+  ```
+  """
+
+  def __init__(self, weight_decay, learning_rate, momentum,
+               use_locking=False, name="MomentumW", use_nesterov=False):
+    """Construct a new MomentumW optimizer.
+
+    For further information see the documentation of the Momentum Optimizer.
+
+    Args:
+      weight_decay: A `Tensor` or a floating point value. The weight decay.
+      learning_rate: A `Tensor` or a floating point value. The learning rate.
+      momentum: A `Tensor` or a floating point value. The momentum.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "MomentumW".
+      use_nesterov: If `True` use Nesterov Momentum.
+        See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+        This implementation always computes gradients at the value of the
+        variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+        variable(s) track the values called `theta_t + mu*v_t` in the paper.
+
+    @compatibility(eager)
+    When eager execution is enabled, learning_rate, weight_decay and momentum
+    can each be a callable that takes no arguments and returns the actual value
+    to use. This can be useful for changing these values across different
+    invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(MomentumWOptimizer, self).__init__(
+        weight_decay, learning_rate=learning_rate, momentum=momentum,
+        use_locking=use_locking, name=name, use_nesterov=use_nesterov)
+
+
+@tf_export("contrib.opt.AdamWOptimizer")
+class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer):
+  """Optimizer that implements the Adam algorithm with weight decay.
+
+  This is an implementation of the AdamW optimizer described in "Fixing
+  Weight Decay Regularization in Adam" by Loshchilov & Hutter
+  (https://arxiv.org/abs/1711.05101)
+  ([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+
+  It computes the update step of `train.AdamOptimizer` and additionally decays
+  the variable. Note that this is different from adding L2 regularization on
+  the variables to the loss: it regularizes variables with large
+  gradients more than L2 regularization would, which was shown to yield better
+  training loss and generalization error in the paper above.
+
+  For further information see the documentation of the Adam Optimizer.
+
+  Note that this optimizer can also be instantiated as
+  ```python
+  extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)(
+      weight_decay=weight_decay)
+  ```
+  """
+
+  def __init__(self, weight_decay, learning_rate=0.001, beta1=0.9, beta2=0.999,
+               epsilon=1e-8, use_locking=False, name="AdamW"):
+    """Construct a new AdamW optimizer.
+
+    For further information see the documentation of the Adam Optimizer.
+
+    Args:
+      weight_decay: A `Tensor` or a floating point value. The weight decay.
+      learning_rate: A Tensor or a floating point value. The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdamW".
+    """
+    super(AdamWOptimizer, self).__init__(
+        weight_decay, learning_rate=learning_rate, beta1=beta1, beta2=beta2,
+        epsilon=epsilon, use_locking=use_locking, name=name)
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
new file mode 100644
index 0000000000..76d8a5697a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
@@ -0,0 +1,188 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for optimizers with weight decay.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import weight_decay_optimizers +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adam + +WEIGHT_DECAY = 0.01 + + +def adamw_update_numpy(param, g_t, t, m, v, lr=0.001, beta1=0.9, + beta2=0.999, epsilon=1e-8): + lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + param_t = (param - lr_t * m_t / (np.sqrt(v_t) + epsilon) - + (param * WEIGHT_DECAY)) + return param_t, m_t, v_t + + +def momentumw_update_numpy(param, g_t, m, lr=0.001, momentum=0.9, **_): + # v, t are not needed for momentum optimizer + m = momentum * m + g_t + param_t = param - lr * m - param * WEIGHT_DECAY + return param_t, m, None + + +class WeightDecayOptimizerTest(test.TestCase): + + def doTest(self, optimizer, update_fn, optimizer_name, slot_name, + use_resource=False, do_sparse=False): + for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]): + with self.test_session(graph=ops.Graph()): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + if use_resource: + var0 = resource_variable_ops.ResourceVariable( + var0_np, name="var0_%d" % i) + var1 = resource_variable_ops.ResourceVariable( + var1_np, name="var1_%d" % i) + else: + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + + if do_sparse: + grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0 = ops.IndexedSlices(constant_op.constant(grads0_np), + constant_op.constant(grads0_np_indices), + constant_op.constant([2])) + grads1_np_indices = np.array([0, 1], dtype=np.int32) + grads1 = ops.IndexedSlices(constant_op.constant(grads1_np), + constant_op.constant(grads1_np_indices), + constant_op.constant([2])) + else: + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = optimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + if not context.executing_eagerly(): + with ops.Graph().as_default(): + # Shouldn't return non-slot variables from other graphs. 
+ self.assertEqual(0, len(opt.variables())) + self.evaluate(variables.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of the optimizer + for t in range(1, 4): + if not context.executing_eagerly(): + self.evaluate(update) + elif t > 1: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = update_fn(var0_np, grads0_np, t=t, m=m0, v=v0) + var1_np, m1, v1 = update_fn(var1_np, grads1_np, t=t, m=m1, v=v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + if use_resource: + self.assertEqual("var0_%d/%s:0" % (i, optimizer_name), + opt.get_slot(var=var0, name=slot_name).name) + + +class AdamWOptimizerTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + return weight_decay_optimizers.AdamWOptimizer(WEIGHT_DECAY) + + def testSparse(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=False, do_sparse=True) + + def testResourceSparse(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=True, do_sparse=True) + + def testBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=True) + + +class MomentumWOptimizerTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + return weight_decay_optimizers.MomentumWOptimizer(WEIGHT_DECAY, 0.001, 0.9) + + def testSparse(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=False, do_sparse=True) + + def testResourceSparse(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=True, do_sparse=True) + + def testBasic(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=True) + + +class ExtendWithWeightDecayTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + adamw = weight_decay_optimizers.extend_with_decoupled_weight_decay( + adam.AdamOptimizer) + return adamw(WEIGHT_DECAY) + + def testBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m", + use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m", + use_resource=True) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py index 9305c6a11c..85918bf850 100644 --- a/tensorflow/contrib/solvers/python/ops/linear_equations.py +++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import linalg_ops def conjugate_gradient(operator, diff 
--git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index a5d8b061b6..adda0b758b 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -49,7 +49,6 @@ tf_cuda_cc_test( tf_custom_op_library( name = "python/ops/_trt_engine_op.so", srcs = [ - "ops/trt_calib_op.cc", "ops/trt_engine_op.cc", ], deps = [ @@ -76,11 +75,9 @@ tf_cuda_library( cc_library( name = "trt_engine_op_kernel", srcs = [ - "kernels/trt_calib_op.cc", "kernels/trt_engine_op.cc", ], hdrs = [ - "kernels/trt_calib_op.h", "kernels/trt_engine_op.h", ], copts = tf_copts(), @@ -89,20 +86,22 @@ cc_library( ":trt_logging", ":trt_plugins", ":trt_resources", + ":trt_conversion", + ":utils", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core/grappler/costs:graph_properties", ] + if_tensorrt([ "@local_config_tensorrt//:nv_infer", ]) + tf_custom_op_library_additional_deps(), - # TODO(laigd) + # TODO(laigd): fix this by merging header file in cc file. alwayslink = 1, # buildozer: disable=alwayslink-with-hdrs ) tf_gen_op_libs( op_lib_names = [ "trt_engine_op", - "trt_calib_op", ], ) @@ -122,7 +121,6 @@ tf_gen_op_wrapper_py( name = "trt_engine_op", gen_locally = True, deps = [ - ":trt_calib_op_op_lib", ":trt_engine_op_op_lib", ":trt_logging", ":trt_shape_function", @@ -140,7 +138,6 @@ tf_custom_op_py_library( kernels = [ ":trt_engine_op_kernel", ":trt_engine_op_op_lib", - ":trt_calib_op_op_lib", ":trt_shape_function", ], srcs_version = "PY2AND3", @@ -191,7 +188,6 @@ tf_py_wrap_cc( deps = [ ":trt_conversion", ":trt_engine_op_kernel", - "//tensorflow/core:framework_lite", "//third_party/python_runtime:headers", ], ) @@ -211,6 +207,7 @@ tf_cuda_library( ], deps = [ ":trt_logging", + ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", "//tensorflow/core:lib_proto_parsing", @@ -237,12 +234,12 @@ tf_cuda_library( ":trt_plugins", ":trt_logging", ":trt_resources", + ":utils", "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", - "//tensorflow/core:framework", "//tensorflow/core:gpu_runtime", "//tensorflow/core:framework_lite", "//tensorflow/core:graph", @@ -343,3 +340,8 @@ py_test( "//tensorflow/python:framework_test_lib", ], ) + +cc_library( + name = "utils", + hdrs = ["convert/utils.h"], +) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index da4dd5a14c..4dc1c551cc 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include #include #include #include @@ -24,10 +24,17 @@ limitations under the License. 
#include #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/contrib/tensorrt/segment/segment.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/process_state.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -39,17 +46,39 @@ limitations under the License. #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT +#include "cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" - namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +// Returns compiled TRT version information {Maj, Min, Patch} +std::vector GetLinkedTensorRTVersion() { + return {NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH}; +} + +// Returns loaded TRT library version {Maj, Min, Patch} +std::vector GetLoadedTensorRTVersion() { + int ver = getInferLibVersion(); + int ver_major = ver / 1000; + ver = ver - ver_major * 1000; + int ver_minor = ver / 100; + int ver_patch = ver - ver_minor * 100; + return {ver_major, ver_minor, ver_patch}; +} + namespace { bool IsTensorRTCandidate(const tensorflow::Node* node) { @@ -82,229 +111,6 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) { PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string())); } -void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, - const std::set& subgraph_node_ids, - tensorflow::EdgeSet* incoming_edges) { - for (int node_id : subgraph_node_ids) { - const tensorflow::Node* node = graph.FindNodeId(node_id); - for (const tensorflow::Edge* edge : node->in_edges()) { - if (!subgraph_node_ids.count(edge->src()->id()) && - !edge->src()->IsSource() && !edge->IsControlEdge()) { - incoming_edges->insert(edge); - VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() - << " Y, "; - } else { - VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() - << " N, "; - } - } - } -} - -void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, - const std::set& subgraph_node_ids, - tensorflow::EdgeSet* outgoing_edges) { - for (int node_id : subgraph_node_ids) { - const tensorflow::Node* node = graph.FindNodeId(node_id); - for (const tensorflow::Edge* edge : node->out_edges()) { - if (!subgraph_node_ids.count(edge->dst()->id()) && - !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() 
- << " Y, "; - outgoing_edges->insert(edge); - } else { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() - << " N, "; - } - } - } -} - -std::pair ParseTensorName(const string& name, - int default_idx = 0) { - string name_no_idx = name; - int idx = default_idx; - const size_t sep = name_no_idx.find_last_of(':'); - if (sep != string::npos) { - name_no_idx = name_no_idx.substr(0, sep); - idx = std::stoi(name.substr(sep + 1)); - } - return std::make_pair(name_no_idx, idx); -} - -std::unordered_map> BuildTensorNameMap( - const std::vector& tensor_names) { - std::unordered_map> result; - for (const string& tensor_name : tensor_names) { - string node_name; - int index; - std::tie(node_name, index) = ParseTensorName(tensor_name); - result[node_name].push_back(index); - } - return result; -} - -// TODO(sami): convert references to pointers -struct ConvertGraphParams { - ConvertGraphParams( - tensorflow::Graph& inp_graph, - const std::vector& output_node_names, - const std::set& subgraph_node_id_numbers, - size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& current_graph_properties, - std::unordered_map>* output_edges, - int engine_precision_mode, const string& device_name, - std::shared_ptr allocator, int cuda_gpu_id) - : graph(inp_graph), - output_names(output_node_names), - subgraph_node_ids(subgraph_node_id_numbers), - max_batch_size(max_supported_batch_size), - max_workspace_size_bytes(max_consumed_workspace_size_bytes), - graph_properties(current_graph_properties), - output_edge_map(output_edges), - precision_mode(engine_precision_mode), - device_name_(device_name), - allocator_(allocator), - cuda_gpu_id_(cuda_gpu_id) {} - tensorflow::Graph& graph; - const std::vector& output_names; - const std::set& subgraph_node_ids; - size_t max_batch_size; - size_t max_workspace_size_bytes; - const tensorflow::grappler::GraphProperties& graph_properties; - std::unordered_map>* output_edge_map; - int precision_mode; - string device_name_; - std::shared_ptr allocator_; - int cuda_gpu_id_; - std::vector> subgraph_inputs; - std::vector> subgraph_outputs; - tensorflow::EdgeSet subgraph_incoming_edges; - tensorflow::EdgeSet subgraph_outgoing_edges; -}; - -static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { - GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, - &p->subgraph_incoming_edges); - - std::set> unique_tensors; - // Add only unique input source nodes. 
If output of an outside node is shared - // between multiple nodes inside the engine, only one edge should be created - for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); - } - p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), - unique_tensors.end()); - GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, - &p->subgraph_outgoing_edges); - unique_tensors.clear(); - // Similar to above, if multiple ouside nodes are sharing the output of an - // internal node only one output port should be created and shared between - // outputs - for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); - } - p->subgraph_outputs.reserve(unique_tensors.size()); - p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - unique_tensors.begin(), unique_tensors.end()); - return tensorflow::Status::OK(); -} - -tensorflow::Status GetCalibNode(ConvertGraphParams* params) { - TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params)); - tensorflow::NodeDef trt_node_def; - SubGraphParams s(params->graph, params->subgraph_node_ids, - params->subgraph_inputs, params->subgraph_outputs, - params->max_batch_size, params->max_workspace_size_bytes, - params->graph_properties, params->output_edge_map, - &trt_node_def, params->precision_mode, params->device_name_, - params->allocator_, params->cuda_gpu_id_); - TF_RETURN_IF_ERROR(InjectCalibrationNode(s)); - tensorflow::Status status; - tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status); - - TF_RETURN_IF_ERROR(status); - - for (auto in_edge : - params->subgraph_incoming_edges) { // loop over incoming edges and - // attach them to calib node - auto src_output = in_edge->src_output(); - auto dst_node = in_edge->dst(); - auto dst_input = in_edge->dst_input(); - VLOG(1) << " update edge " << trt_node->name() << ":" << src_output - << " -> " << dst_node->name() << ":" << dst_input; - TF_RETURN_IF_ERROR( - params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input)); - } - return tensorflow::Status::OK(); -} - -tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { - TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params)); - tensorflow::NodeDef trt_node_def; - - SubGraphParams s(params->graph, params->subgraph_node_ids, - params->subgraph_inputs, params->subgraph_outputs, - params->max_batch_size, params->max_workspace_size_bytes, - params->graph_properties, params->output_edge_map, - &trt_node_def, params->precision_mode, params->device_name_, - params->allocator_, params->cuda_gpu_id_); - TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s)); - tensorflow::Status status; - tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status); - - // AddNode does not wire edges. 
- // Re-map incoming edges to use the new TRT node instead of the orig subgraph - std::map, int> subgraph_edge_to_input_map; - for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { - subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); - } - std::set> unique_tensors; - for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { - std::pair old_src = {edge->src()->id(), edge->src_output()}; - if (unique_tensors.count(old_src)) continue; - unique_tensors.insert(old_src); - int new_src_output = subgraph_edge_to_input_map.at(old_src); - params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, - new_src_output); - VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() - << " -> " << trt_node->name() << ":" << new_src_output; - params->graph.RemoveEdge(edge); - } - if (VLOG_IS_ON(2)) { - VLOG(2) << "new edge count: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); - } - } - TF_RETURN_IF_ERROR(status); - - // Re-map outgoing edges to use the new TRT node instead of the orig subgraph - std::map, int> subgraph_edge_to_output_map; - for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) { - subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i}); - } - TF_RETURN_IF_ERROR(status); - for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) { - std::pair old_src = {edge->src()->id(), edge->src_output()}; - int new_src_output = subgraph_edge_to_output_map.at(old_src); - TF_RETURN_IF_ERROR(params->graph.UpdateEdge( - trt_node, new_src_output, edge->dst(), edge->dst_input())); - VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " - << edge->dst()->name() << ":" << edge->dst_input(); - } - // Remove the original subgraph - for (int node_id : params->subgraph_node_ids) { - tensorflow::Node* node = params->graph.FindNodeId(node_id); - // Don't remove the input placeholders - if (node->type_string() == "Placeholder") { - continue; - } - params->graph.RemoveNode(node); - } - return tensorflow::Status::OK(); -} - tensorflow::Status BuildNodeMap( const tensorflow::Graph& graph, std::unordered_map* node_map) { @@ -318,51 +124,77 @@ tensorflow::Status BuildNodeMap( } } // namespace + +// Function to get calibration from ResourceMgr and put them into nodedef. tensorflow::Status ConvertCalibGraphToInferGraph( - const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) { + const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph, + bool is_dyn_op) { VLOG(0) << "Starting Calib Conversion"; - tensorflow::Graph graph(tensorflow::OpRegistry::Global()); - TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( - tensorflow::GraphConstructorOptions(), graph_def, &graph)); - // get calib nodes - std::vector calib_nodes; - std::vector topo_order; - tensorflow::GetPostOrder(graph, &topo_order); - for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { - auto node = *rit; - if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node " << node->name(); - calib_nodes.push_back(node); - } + infer_graph->CopyFrom(graph_def); + auto trt_rm = TRTResourceManager::instance(); + auto calib_rm = trt_rm->getManager("TRTCalibration"); + int num_nodes = infer_graph->node_size(); + if (!is_dyn_op) { + LOG(WARNING) << "Construction of static int8 engine is not implemented " + "yet!. 
Dynamic engine will be constructed"; } - VLOG(0) << "Num Calib nodes in graph= " << calib_nodes.size(); - if (calib_nodes.size() == 0) - return tensorflow::errors::FailedPrecondition( - "Graph doesn't contain any calibration nodes!." - " Please generate calibration graph and run calibration first"); - for (auto n : calib_nodes) { - TF_RETURN_IF_ERROR( - tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n)); + for (int i = 0; i < num_nodes; ++i) { + auto n = infer_graph->mutable_node(i); + if (n->op() == "TRTEngineOp") { + VLOG(1) << "Processing " << n->name(); + string container_name = n->attr().at("segment_funcdef_name").s(); + TRTCalibrationResource* cres = nullptr; + auto status = calib_rm->Lookup(container_name, "Calibrator", &cres); + if (!status.ok()) { + LOG(ERROR) << "Could not get Calibration information. Did you run with " + "calibration data?"; + return tensorflow::errors::FailedPrecondition( + "Need to run graph with calibration data first!"); + } + if (cres->calibrator_) { + cres->calibrator_->setDone(); + cres->thr_->join(); + const auto& calibration_table = + cres->calibrator_->getCalibrationTableAsString(); + if (!calibration_table.size()) { + LOG(ERROR) << "Calibration table is empty"; + return tensorflow::errors::Unknown( + "Calibration table is missing. This shouldn't have happened!"); + } + n->mutable_attr()->at("calibration_data").set_s(calibration_table); + } else { + LOG(ERROR) << "Can't get TRTCalibrator from resource manager!"; + return tensorflow::errors::Unknown( + "Can't get TRTCalibrator from resource manager!"); + } + cres->Unref(); + } } - graph.ToGraphDef(infer_graph); return tensorflow::Status::OK(); } +// Entry function from Python. tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, - int precision_mode = FP32MODE, int minimum_segment_size = 3) { + int precision_mode, int minimum_segment_size, bool is_dyn_op, + int max_cached_engines, std::vector cached_engine_batches) { // optimization pass tensorflow::grappler::GrapplerItem item; item.fetch = output_names; item.graph = graph_def; - + // grappler requires a virtual cluster with a proper GPU device + // in order to calculate flops>0 or fails with FATAL + // We add numbers from a Pascal card here to have flops>0 tensorflow::DeviceProperties device_properties; device_properties.set_type("GPU"); device_properties.mutable_environment()->insert({"architecture", "6"}); - tensorflow::grappler::Cluster* cluster = - new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}}); + device_properties.set_num_cores(3584); + device_properties.set_frequency(1531); + std::unique_ptr cluster( + new tensorflow::grappler::VirtualCluster( + {{"/GPU:0", device_properties}})); // single machine int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores(); @@ -370,134 +202,633 @@ tensorflow::Status ConvertGraphDefToTensorRT( VLOG(2) << "cpu_cores: " << num_cpu_cores; VLOG(2) << "gpus: " << num_gpus; tensorflow::RewriterConfig rw_cfg; + // use only const folding and layout for the time being since new optimizers + // break the graph for us + rw_cfg.add_optimizers("constfold"); + rw_cfg.add_optimizers("layout"); + rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg); tensorflow::GraphDef gdef; - TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef)); + 
TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef)); item.graph = gdef; // AJ refactoring shape inference through grappler/GraphProperties. tensorflow::grappler::GraphProperties static_graph_properties(item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); // Build full graph - - return ConvertAfterShapes(gdef, output_names, max_batch_size, - max_workspace_size_bytes, new_graph_def, - precision_mode, minimum_segment_size, - static_graph_properties, nullptr); + ConversionParams cp; + cp.input_graph_def = &gdef; + cp.output_names = &output_names; + cp.max_batch_size = max_batch_size; + cp.output_graph_def = new_graph_def; + cp.precision_mode = precision_mode; + cp.is_dyn_op = is_dyn_op; + cp.max_cached_engines = max_cached_engines; + cp.cached_engine_batches = cached_engine_batches; + cp.minimum_segment_size = minimum_segment_size; + cp.graph_properties = &static_graph_properties; + cp.max_workspace_size_bytes = max_workspace_size_bytes; + if (VLOG_IS_ON(5)) { + std::fstream f; + f.open("TRTConversionInput.pb", + std::fstream::out | std::fstream::binary | std::fstream::trunc); + f << gdef.SerializeAsString(); + f.close(); + } + return ConvertAfterShapes(cp); } -tensorflow::Status ConvertAfterShapes( - const tensorflow::GraphDef& gdef, const std::vector& output_names, - size_t max_batch_size, size_t max_workspace_size_bytes, - tensorflow::GraphDef* new_graph_def, int precision_mode, - int minimum_segment_size, +// Function to get subsegment information structure. +tensorflow::Status GetEngineInfo( + const tensorflow::Graph* g, const tensorflow::grappler::GraphProperties& graph_properties, - const tensorflow::grappler::Cluster* cluster) { - // Segment the graph into subgraphs that can be converted to TensorRT - tensorflow::tensorrt::segment::SegmentOptions segment_options; + const std::set& segment_nodes, + const std::unordered_map& node_map, + const std::vector& reverse_topo_order, + EngineInfo* info) { + std::vector subgraph_node_ids; + std::set segment_devices; + int input_port = 0; + int output_port = 0; + + // Map from src_node_name+port to the unique port numbers of the TRT op, where + // the src_node_name is the name of the source node of the input/output + // edge, thus there must not be any duplicates since source nodes of + // input/output edges must be in different split of the graph. + // TODO(aaroey): consider using node id and port instead. + std::unordered_map created_edges; + for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend(); + ++it) { + const auto& node_name = (*it)->name(); + + if (segment_nodes.count(node_name) == 0) continue; + auto node = node_map.at(node_name); + auto node_device = node->requested_device(); + if (!node_device.empty()) { + segment_devices.insert(node_device); + } else { + if (node->has_assigned_device_name()) { + segment_devices.insert(node->assigned_device_name()); + } else { + VLOG(2) << "Node " << node->name() + << " neither have requested device nor assigned device"; + } + } + int node_id = node->id(); + subgraph_node_ids.push_back(node_id); + for (const auto edge : node->in_edges()) { + auto input_node = edge->src(); + if (segment_nodes.count(input_node->name()) == 0) { + // Add constant input node into the segment. We don't care if it has + // other output edges going into other engines or TF nodes. Since we add + // it only to the subsegment node list, not the subsegment itself, it + // won't be removed from the graph. If it doesn't have any edges, TF + // will prune it out. 
+ if (input_node->type_string() == "Const") { + subgraph_node_ids.push_back(input_node->id()); + } else if (!edge->IsControlEdge() && !input_node->IsSource()) { + string s(input_node->name()); + StrAppend(&s, ":", edge->src_output()); + VLOG(1) << "Input edge = " << s; + int port = input_port; + if (created_edges.count(s)) { + port = created_edges.at(s); + } else { + created_edges.insert({s, port}); + input_port++; + } + info->connections.emplace_back(input_node->name(), input_node->id(), + edge->src_output(), node_name, node_id, + edge->dst_input(), true, port); + } + } + } + for (const auto edge : node->out_edges()) { + auto output_node = edge->dst(); + if (segment_nodes.count(output_node->name()) == 0 && + !edge->IsControlEdge() && !output_node->IsSink()) { + string s(node_name); + StrAppend(&s, ":", edge->src_output()); + VLOG(1) << "Output edge = " << s; + int port = output_port; + if (created_edges.count(s)) { + port = created_edges.at(s); + } else { + created_edges.insert({s, port}); + output_port++; + } + info->connections.emplace_back(output_node->name(), output_node->id(), + edge->dst_input(), node_name, node_id, + edge->src_output(), false, port); + } + } + } + + TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( + g, graph_properties, subgraph_node_ids, &info->connections, + &info->segment_graph_def, &info->engine_name)); + // TODO(sami): This should not happen once segmenter is updated. + if (segment_devices.size() == 1) { + info->device = *segment_devices.begin(); + } else if (segment_devices.size() > 1) { + LOG(WARNING) << "Detected multiple(" << segment_devices.size() + << ") devices for the segment. Picking first one to continue " + << "but this shouldn't have happened"; + info->device = *segment_devices.begin(); + } else { + VLOG(1) << "Segment devices size is 0"; + } + return Status::OK(); +} + +// Function to insert a TRT node into the graph. The graph is not modified if +// the returned status is not ok. +// 'alloc' is only used for creating static engine. +tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, + const std::vector& infos, int pos, + nvinfer1::IGpuAllocator* alloc, + int max_batch_size) { + const auto& info = infos.at(pos); + std::vector out_shapes; + std::vector input_shapes; + std::vector shapes; + std::vector inputs; + std::vector out_types; + VLOG(1) << "Processing " << info.engine_name; + + // Update the shape and data types of input/output nodes, and find all unique + // inputs. + for (const auto& conn : info.connections) { + if (!conn.is_input_edge) { + // Set the shapes and data types of output edge. + tensorflow::TensorShapeProto out_shape; + // shape of the output node inside segment + conn.inside_shape.AsProto(&out_shape); + if (out_shapes.size() <= conn.port_number) { + out_shapes.resize(conn.port_number + 1); + out_types.resize(conn.port_number + 1); + } + out_shapes.at(conn.port_number) = out_shape; + out_types.at(conn.port_number) = conn.connection_type; + continue; + } + + // Set the shapes and data types of input edge. + tensorflow::TensorShapeProto in_shape; + conn.outside_shape.AsProto(&in_shape); + if (input_shapes.size() <= conn.port_number) { + input_shapes.resize(conn.port_number + 1); + shapes.resize(conn.port_number + 1); + } + input_shapes.at(conn.port_number) = in_shape; + shapes.at(conn.port_number) = conn.outside_shape; + + string input_node = conn.outside_node_name; + int input_port = conn.outside_port; + bool found_engine = false; + // Rewire the inputs to other engines if they contain original input node. 
+ // Note that we use the information of the engine here, not the information + // of the created TRT nodes, so we're able to find all the connections to + // any other engines beforehand. + for (size_t t = 0; t < infos.size(); ++t) { + if (t == pos) continue; + auto& engine_info = infos.at(t); + for (const auto& eng_conn : engine_info.connections) { + if (eng_conn.is_input_edge) continue; + if (eng_conn.inside_node_name == input_node) { + input_node = engine_info.engine_name; + if (eng_conn.inside_port == input_port) { + input_port = eng_conn.port_number; + found_engine = true; + break; + } + } + } + if (found_engine) break; + } + VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> " + << info.engine_name << ":" << inputs.size(); + // Skip duplicate inputs. + bool new_input = true; + for (const auto& inp : inputs) { + if (inp.node == input_node && inp.index == input_port) { + new_input = false; + break; + } + } + if (new_input) { + inputs.emplace_back(input_node, input_port, conn.connection_type); + } + } + + // Build the engine and get its serialized representation. + string segment_string; + if (info.engine_type == EngineInfo::EngineType::TRTStatic || + info.precision_mode == INT8MODE) { + // Create static engine for fp32/fp16 mode, and test validity of the engine + // for int8 mode. We don't want engine to fail at the calibration time. + // So we are constructing a FP32 engine here to check its validity, and if + // it is a valid engine then we put the serialized graphdef to the op. + // Otherwise we skip node creation for this engine. + Logger trt_logger; + TrtUniquePtrType engine; + // TODO(sami): What happens if 1st dim is not batch? + TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( + info.segment_graph_def, + info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode, + max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger, + alloc, /*calibrator=*/nullptr, &engine, + /*convert_successfully=*/nullptr)); + TrtUniquePtrType engine_data(engine->serialize()); + segment_string = + string((const char*)engine_data->data(), engine_data->size()); + if (info.precision_mode == INT8MODE) { + // See above comment about why not putting this inside the 'else' branch. + segment_string = info.segment_graph_def.SerializeAsString(); + } + } else { + segment_string = info.segment_graph_def.SerializeAsString(); + } + + // TODO(aaroey): use enum instead, and add a helper method to do the + // conversion. 
+ string prec_string; + switch (info.precision_mode) { + case FP32MODE: + prec_string = "FP32"; + break; + case FP16MODE: + prec_string = "FP16"; + break; + case INT8MODE: + prec_string = "INT8"; + if (!TRTResourceManager::instance()->getManager("TRTCalibration")) { + LOG(ERROR) << "Failed to construct calibration storage"; + } + break; + default: + return tensorflow::errors::OutOfRange("Unknown precision mode"); + } + tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp"); + if (!info.device.empty()) node_builder.Device(info.device); + if (VLOG_IS_ON(1)) { + string ins = StrCat(info.engine_name, " inputs= "); + for (const auto& ii : inputs) { + StrAppend(&ins, ii.node, ":", ii.index, " "); + } + VLOG(1) << ins; + } + node_builder.Input(inputs); + if (info.engine_type == EngineInfo::EngineType::TRTStatic && + info.cached_engine_batches.size()) { + LOG(WARNING) << "Cached engine batches are ignored for static engines"; + } + tensorflow::NodeDef trt_node; + tensorflow::Status status = + node_builder.Attr("input_shapes", input_shapes) + .Attr("output_shapes", out_shapes) + .Attr("static_engine", + info.engine_type == EngineInfo::EngineType::TRTStatic) + .Attr("segment_funcdef_name", + StrCat(info.engine_name, "_native_segment")) + .Attr("serialized_segment", segment_string) + .Attr("calibration_data", "") + .Attr("max_cached_engines_count", info.maximum_cached_engines) + .Attr("cached_engine_batches", {max_batch_size}) + .Attr("workspace_size_bytes", info.max_workspace_size_bytes) + .Attr("precision_mode", prec_string) + .Attr("OutT", out_types) + .Finalize(&trt_node); + if (!status.ok()) { + LOG(ERROR) << "Node construction failed with" << status; + return status; + } + VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph"; + + // Up until this point, graph is not modified. If we return !status.ok() from + // here, this segment will be skipped + tensorflow::Node* engine_node = graph->AddNode(trt_node, &status); + if (!status.ok()) { + LOG(ERROR) << "Adding node failed " << status; + return status; + } + // Updates the inputs of output edges destination nodes, and point them to the + // engine node. + for (auto& conn : info.connections) { + if (conn.is_input_edge) continue; + VLOG(1) << " Updating DBG " << engine_node->name() << " out_port " + << conn.port_number << " out_id " << conn.outside_id + << " name=" << conn.outside_node_name; + auto dst_node = graph->FindNodeId(conn.outside_id); + // dst_node can only be removed if it is an input node of another engine. + // In this case, other engines input edge is updated in nodedef to point to + // this engine. Even though edge doesn't exists in the graph, when it is + // deserialized again, correct edges will be constructed. This is a problem + // of graph->AddNode(). + if (!dst_node) continue; + VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number + << " to " << dst_node->name() << ":" << conn.outside_port; + auto new_edge = graph->AddEdge(engine_node, conn.port_number, dst_node, + conn.outside_port); + CHECK(new_edge) << "Adding a new edge failed " << engine_node->name() << ":" + << conn.port_number << " -> " << dst_node->name() << ":" + << conn.outside_port; + } + return status; +} + +// Function to construct a funcdef from the segment and add it to the graph. 
+tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( + tensorflow::Graph* graph, const tensorflow::GraphDef& segment, + const string& name) { + tensorflow::Graph sgraph(graph->flib_def()); + tensorflow::GraphConstructorOptions gcopts; + TF_RETURN_IF_ERROR( + tensorflow::ConvertGraphDefToGraph(gcopts, segment, &sgraph)); + std::map io_nodes; + int num_inputs = 0; + for (auto n : sgraph.op_nodes()) { + if (tensorflow::str_util::StartsWith(n->name(), kInputPHName)) { + num_inputs++; + io_nodes.insert({n->name(), n}); + } else if (tensorflow::str_util::StartsWith(n->name(), kOutputPHName)) { + io_nodes.insert({n->name(), n}); + } + } + + for (int i = 0; i < num_inputs; ++i) { + auto name = StrCat(kInputPHName, i); + auto node = io_nodes[name]; + tensorflow::NodeDef nd; + tensorflow::NodeDefBuilder node_builder( + StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp); + VLOG(1) << "Adding " << StrCat(name, "_Arg"); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); + tensorflow::Status s; + auto node_arg = sgraph.AddNode(nd, &s); + if (!s.ok()) { + LOG(ERROR) << "Couldn't add _Arg node for " << name; + } + for (auto edge : node->out_edges()) { + sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); + VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 + << " - > " << edge->dst()->name() << ":" << edge->dst_input(); + if (!s.ok()) { + LOG(ERROR) << "Failed to update edge from " << node_arg->name() + << " to " << edge->dst()->name() << ":" << edge->dst_input(); + } + } + sgraph.RemoveNode(node); + } + + for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { + auto name = StrCat(kOutputPHName, i); + auto node = io_nodes[name]; + tensorflow::NodeDef nd; + tensorflow::NodeDefBuilder node_builder( + StrCat(name, "_Ret"), tensorflow::FunctionLibraryDefinition::kRetOp); + auto edge = *(node->in_edges().begin()); + tensorflow::NodeDefBuilder::NodeOut nout( + edge->src()->name(), edge->src_output(), + edge->src()->output_type(edge->src_output())); + VLOG(1) << " input " << nout.node << ":" << nout.index + << " dtype=" << tensorflow::DataTypeString(nout.data_type); + node_builder.Input({nout}); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); + if (VLOG_IS_ON(3)) { + VLOG(3) << nd.DebugString(); + } + tensorflow::Status s; + auto node_ret = sgraph.AddNode(nd, &s); + if (!s.ok()) { + LOG(ERROR) << "Couldn't add _Ret node for " << name; + } + VLOG(1) << "Update edge from " << edge->src()->name() << ":" + << edge->src_output() << " - > " << node_ret->name() << ":" << 0; + sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0); + s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); + if (!s.ok()) { + LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" + << edge->src_output() << " - > " << node_ret->name() << ":" + << 0; + } + sgraph.RemoveNode(node); + } + tensorflow::FunctionDefLibrary fdeflib; + auto native_segment = fdeflib.add_function(); + TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef( + sgraph, StrCat(name, "_native_segment"), native_segment)); + if (VLOG_IS_ON(7)) { + VLOG(7) << name << " Function_Def "; + VLOG(7) << native_segment->DebugString(); + } + VLOG(1) << "Adding funcdef to graphlib"; + TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); + return tensorflow::Status::OK(); +} + +std::pair GetDeviceAndAllocator( + ConversionParams& params, EngineInfo& engine) { + int cuda_device_id = -1; 
+ auto check_device_id = [](int tfid) -> int { + tensorflow::TfGpuId tf_gpu_id(tfid); + CudaGpuId cuda_gpu_id; + Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); + if (s.ok()) { + VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " + << cuda_gpu_id.value(); + return cuda_gpu_id.value(); + } + VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s; + return -1; + }; + tensorflow::Allocator* dev_allocator = nullptr; + // we need to us PM here since in python path there is no way to get + // to allocators. + // TODO(sami): when grappler devices become available else path will not be + // necessary + auto pm = tensorflow::ProcessState::singleton(); + if (params.cluster) { // get allocator + tensorflow::Device* device = nullptr; + if (params.cluster->GetDeviceSet()) { + device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device); + } + if (device) { + tensorflow::AllocatorAttributes alloc_attr; + dev_allocator = device->GetAllocator(alloc_attr); + VLOG(1) << "Using allocator " << dev_allocator->Name(); + } else { + LOG(WARNING) << "Cluster is set but device '" << engine.device + << "' is not found in the cluster"; + } + } else { // cluster not found, possibly a python call + VLOG(1) << "Cluster is not set, probably called from python"; + int found_device = 0; + bool try_gpu_ids = true; + // if device is set, try to find the device. Might be a problem for multi + // host case but TensorRT do not support multi host setups yet. + if (!engine.device.empty()) { + DeviceNameUtils::ParsedName parsed_name; + if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) { + cuda_device_id = parsed_name.has_id ? parsed_name.id : -1; + } + try_gpu_ids = !parsed_name.has_id; + } + if (try_gpu_ids) { + while (found_device < 100) { + cuda_device_id = check_device_id(found_device); + if (cuda_device_id >= 0) break; + found_device++; + } + } + if (found_device == 100) { + LOG(ERROR) << " Can't find a GPU device to work with. Please " + "instantiate a session to initialize devices"; + return std::make_pair(cuda_device_id, dev_allocator); + } + LOG(WARNING) + << "Can't determine the device, constructing an allocator at device " + << found_device; + tensorflow::GPUOptions gpuoptions; + // this will be a noop if device is already initialized + gpuoptions.set_allow_growth(true); + tensorflow::TfGpuId tf_gpu_id(found_device); + dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); + } + return std::make_pair(cuda_device_id, dev_allocator); +} + +// Entry function from optimization pass. +tensorflow::Status ConvertAfterShapes(ConversionParams& params) { + // Convert graphdef to graph. tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(), - gdef.library()); + params.input_graph_def->library()); tensorflow::Graph graph(flib); TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( - tensorflow::GraphConstructorOptions(), gdef, &graph)); + tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph)); + // Segment the graph into subgraphs that can be converted to TensorRT + tensorflow::tensorrt::segment::SegmentOptions segment_options; // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT) - for (auto node : output_names) { + for (auto node : *(params.output_names)) { segment_options.exclude_node_list.insert(node); } - - // TODO(sami): this should be passed as a knob!!!! 
- segment_options.minimum_segment_size = minimum_segment_size; - tensorflow::tensorrt::segment::SegmentNodesVector segments; + segment_options.minimum_segment_size = params.minimum_segment_size; + tensorflow::tensorrt::segment::SegmentNodesVector initial_segments; TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( - &graph, IsTensorRTCandidate, segment_options, &segments)); - if (segments.size() > 1) { - VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size(); + &graph, IsTensorRTCandidate, segment_options, &initial_segments)); + if (initial_segments.size() > 1) { + VLOG(0) << "MULTIPLE tensorrt candidate conversion: " + << initial_segments.size(); } + + // Get the EngineInfo for each segment. std::unordered_map node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - std::unordered_map> output_edge_map; - int count = 0; float total_num_nodes_in_segments = 0.; - for (auto s : segments) { - total_num_nodes_in_segments += s.first.size(); - } - // We create the map here since cluster may not be available in all cases. - std::map name_to_device_map; - if (cluster) { - // TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a - // distributed environment, devices from different workers can have same - // short name. - for (const auto dm : cluster->GetDeviceSet()->devices()) { - name_to_device_map[dm->name()] = dm; + std::vector engine_segments; + engine_segments.reserve(initial_segments.size()); + std::vector reverse_topo_order; + tensorflow::GetPostOrder(graph, &reverse_topo_order); + size_t total_engine_bytes_size = 0; + std::vector engine_bytes_size; + tensorflow::tensorrt::segment::SegmentNodesVector converted_segments; + converted_segments.reserve(initial_segments.size()); + for (size_t t = 0; t < initial_segments.size(); t++) { + auto& curr_segment = initial_segments.at(t); + EngineInfo curr_engine; + Status status = + GetEngineInfo(&graph, *params.graph_properties, curr_segment.first, + node_map, reverse_topo_order, &curr_engine); + if (!status.ok()) { + LOG(WARNING) << "Failed to get engine info for segment " << t << ": " + << status; + continue; } - } - for (const auto& segment_nodes_and_device : segments) { - const std::set& subgraph_node_names = - segment_nodes_and_device.first; - std::set subgraph_node_ids; - size_t max_mem_per_engine = - max_workspace_size_bytes * - ((float)subgraph_node_names.size() / total_num_nodes_in_segments); - std::stringstream oss; - for (const string& node_name : subgraph_node_names) { - oss << " " << node_name; - subgraph_node_ids.insert(node_map.at(node_name)->id()); + curr_engine.precision_mode = params.precision_mode; + curr_engine.engine_type = + (params.is_dyn_op || params.precision_mode == INT8MODE + ? 
EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic); + curr_engine.cached_engine_batches = params.cached_engine_batches; + curr_engine.maximum_cached_engines = params.max_cached_engines; + StrAppend(&curr_engine.engine_name, "my_trt_op_", t); + status = RegisterSegmentFunctionToFunctionLibrary( + &graph, curr_engine.segment_graph_def, curr_engine.engine_name); + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef as a function " << t + << ": " << status; + continue; } - VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second - << " : " << oss.str(); - auto target_device = - name_to_device_map.find(segment_nodes_and_device.second); - std::shared_ptr allocator(0); + engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); + total_engine_bytes_size += engine_bytes_size.back(); + total_num_nodes_in_segments += curr_segment.first.size(); + engine_segments.push_back(std::move(curr_engine)); + converted_segments.push_back(std::move(curr_segment)); + + if (VLOG_IS_ON(8)) { + string fname = curr_engine.engine_name; + StrAppend(&fname, ".pb"); + std::fstream f; + f.open(fname.c_str(), std::fstream::out | std::fstream::binary); + f << engine_segments.at(t).segment_graph_def.SerializeAsString(); + f.close(); + } + } + + // Create a TRT node for each segment using its EngineInfo. + int old_cuda_device = 0; + auto err = cudaGetDevice(&old_cuda_device); + if (err != cudaSuccess) { + LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); + } + VLOG(1) << "Current cuda device is " << old_cuda_device; + for (int i = 0; i < engine_segments.size(); ++i) { + auto& engine = engine_segments.at(i); + // Partition the workspace size by the average of node ratio and segment + // graphdef size + engine.max_workspace_size_bytes = + params.max_workspace_size_bytes * + (engine_bytes_size.at(i) / total_engine_bytes_size + + converted_segments.at(i).first.size() / total_num_nodes_in_segments) / + 2.0; + // The allocator is used to build the engine. The build and the built engine + // will be destroyed after we get the serialized engine string, so it's fine + // to use unique_ptr here. + std::unique_ptr alloc; + auto device_alloc = GetDeviceAndAllocator(params, engine); int cuda_device_id = 0; - if (target_device != name_to_device_map.end()) { - tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id); - CudaGpuId cuda_gpu_id; - Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); - if (!s.ok()) { - LOG(ERROR) - << "Cuda device identification failed, using device 0. Error= " - << s; - } else { - cuda_device_id = cuda_gpu_id.value(); - } - tensorflow::GPUOptions gpuoptions; - // we need to us PM here since in python path there is no way to get to - // allocators - auto pm = tensorflow::ProcessState::singleton(); - // this should be instantiated by now - auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); - VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value() - << " cuda device= " << cuda_device_id << " at " << dev_allocator; - allocator = std::make_shared(dev_allocator); - } else { // device unknown or not available - allocator = std::make_shared(); + if (device_alloc.first >= 0) { + cuda_device_id = device_alloc.first; + alloc.reset(new TRTDeviceAllocator(device_alloc.second)); + } else { + // Setting allocator as nullptr should get revert to the cudamalloc + LOG(WARNING) << "Can't identify the cuda device. 
Running on device 0 "; } - ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size, - max_mem_per_engine, graph_properties, &output_edge_map, - precision_mode, segment_nodes_and_device.second, - allocator, cuda_device_id); - if (precision_mode == INT8MODE) { - tensorflow::Status status = GetCalibNode(&p); - if (status != tensorflow::Status::OK()) { - LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count - << " due to: \"" << status.ToString() - << "\" SKIPPING......( " << subgraph_node_names.size() - << " nodes)"; + cudaSetDevice(cuda_device_id); + auto status = CreateTRTNode(&graph, engine_segments, i, alloc.get(), + params.max_batch_size); + // If status is ok, we successfully added the node to the graph and can + // remove segment ops. Otherwise graph is not modified. + if (status.ok()) { + for (auto node_name : converted_segments.at(i).first) { + graph.RemoveNode(node_map.at(node_name)); } } else { - tensorflow::Status status = ConvertSubGraphToTensorRT(&p); - if (status != tensorflow::Status::OK()) { - LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count - << " due to: \"" << status.ToString() - << "\" SKIPPING......( " << subgraph_node_names.size() - << " nodes)"; - } + // Graph is not modified. + LOG(WARNING) << "Engine creation for segment " << i << ", composed of " + << converted_segments.at(i).first.size() + << " nodes failed: " << status << ". Skipping..."; } - count++; } - graph.ToGraphDef(new_graph_def); + cudaSetDevice(old_cuda_device); + graph.ToGraphDef(params.output_graph_def); + VLOG(1) << "Returning from conversion"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h index 65a67d7e73..9d986e4890 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.h +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h @@ -30,29 +30,60 @@ namespace tensorflow { namespace tensorrt { namespace convert { -// This method converts an already generated calibration graph which was used in -// calibration runs to an inference graph +struct ConversionParams { + ConversionParams() + : input_graph_def(nullptr), + max_batch_size(1), + max_workspace_size_bytes(1 << 30), + output_graph_def(nullptr), + precision_mode(1), + minimum_segment_size(3), + graph_properties(nullptr), + cluster(nullptr), + is_dyn_op(false), + fixed_input_size(true), + max_cached_engines(1) {} + const tensorflow::GraphDef* input_graph_def; + const std::vector* output_names; + size_t max_batch_size; + size_t max_workspace_size_bytes; + tensorflow::GraphDef* output_graph_def; + int precision_mode; + int minimum_segment_size; + const tensorflow::grappler::GraphProperties* graph_properties; + const tensorflow::grappler::Cluster* cluster; + bool is_dyn_op; // Whether to create engine on conversion or execution time + bool fixed_input_size; // Assume non-batch ranks of input tensors are fixed + int max_cached_engines; // maximum number of cached engines + std::vector cached_engine_batches; // list of cached engines +}; + +// This method extracts calibration information from the resource managers +// and puts them in to engine nodedefs. 
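As a usage sketch for the ConversionParams struct defined above (hypothetical caller-side code, mirroring how TRTOptimizationPass::Optimize fills the same fields later in this change), the struct is populated and handed to ConvertAfterShapes roughly like this, assuming the snippet sits inside namespace tensorflow::tensorrt::convert in a Status-returning function:

// Sketch only: graph_def, output_names, graph_properties, cluster and
// converted_graph_def are assumed to be provided by the caller.
ConversionParams params;
params.input_graph_def = &graph_def;
params.output_names = &output_names;
params.max_batch_size = 8;
params.max_workspace_size_bytes = 1 << 30;
params.output_graph_def = &converted_graph_def;
params.precision_mode = FP16MODE;
params.minimum_segment_size = 3;
params.graph_properties = &graph_properties;
params.cluster = cluster;
params.is_dyn_op = false;  // build engines at conversion time
params.max_cached_engines = 1;
TF_RETURN_IF_ERROR(ConvertAfterShapes(params));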
tensorflow::Status ConvertCalibGraphToInferGraph( - const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def); + const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def, + bool is_dyn_op); -// max_batch_size: maximum batch size which can be used for inference for -// optimization targets inference run with max batch size. -// max_workspace_size_bytes: The upper bound of memory allowance for -// engine building. +// - max_batch_size: maximum batch size which can be used for inference for +// optimization targets inference run with max batch size. +// - max_workspace_size_bytes: The upper bound of memory allowance for engine +// building. tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, - int precision_mode, int minimum_segment_size); + int precision_mode = 1, int minimum_segment_size = 3, + bool is_dyn_op = false, int max_cached_engines = 1, + std::vector cached_engine_batches = {}); // Method to call from optimization pass -tensorflow::Status ConvertAfterShapes( - const tensorflow::GraphDef& graph, const std::vector& output_names, - size_t max_batch_size, size_t max_workspace_size_bytes, - tensorflow::GraphDef* new_graph_def, int precision_mode, - int minimum_segment_size, - const tensorflow::grappler::GraphProperties& graph_properties, - const tensorflow::grappler::Cluster* cluster); +tensorflow::Status ConvertAfterShapes(ConversionParams& params); + +// Return compile time TensorRT library version information. +std::vector GetLinkedTensorRTVersion(); + +// Return runtime time TensorRT library version information. +std::vector GetLoadedTensorRTVersion(); } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 4e4d295538..146b9c7344 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include #include @@ -25,7 +24,9 @@ limitations under the License. #include #include +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT @@ -37,6 +38,7 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -54,8 +56,11 @@ limitations under the License. 
namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::str_util::Split; + using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; + namespace { inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype, @@ -121,12 +126,10 @@ static std::vector> CreateSamePadding( string GetCommonNameScope(const string& op_name_a, const string& op_name_b) { size_t last_scope_separator = 0; - for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) { - if (op_name_a[i] != op_name_b[i]) { - break; - } else if (op_name_a[i] == '/') { - last_scope_separator = i + 1; - } + const size_t min_size = std::min(op_name_a.size(), op_name_b.size()); + for (size_t i = 0; i < min_size; ++i) { + if (op_name_a[i] != op_name_b[i]) break; + if (op_name_a[i] == '/') last_scope_separator = i + 1; } return op_name_a.substr(0, last_scope_separator); } @@ -417,20 +420,6 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, } } -struct InferDeleter { - template - void operator()(T* obj) const { - if (obj) { - obj->destroy(); - } - } -}; - -template -inline std::shared_ptr infer_object(T* obj) { - return std::shared_ptr(obj, InferDeleter()); -} - class Converter; using OpConverter = @@ -444,7 +433,7 @@ class Converter { OpConverter plugin_converter_; nvinfer1::INetworkDefinition* trt_network_; std::list> temp_bufs_; - tensorflow::tensorrt::TRTWeightStore* weight_store_; + TRTWeightStore* weight_store_; bool fp16_; void register_op_converters(); tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def, @@ -486,11 +475,11 @@ class Converter { public: explicit Converter(nvinfer1::INetworkDefinition* trt_network, - tensorflow::tensorrt::TRTWeightStore* ws, bool fp16) + TRTWeightStore* ws, bool fp16) : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) { this->register_op_converters(); } - tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; } + TRTWeightStore* weight_store() { return weight_store_; } TRT_ShapedWeights get_temp_weights(tensorflow::DataType type, nvinfer1::Dims shape) { TRT_ShapedWeights weights(type, nullptr, shape); @@ -2140,559 +2129,265 @@ void Converter::register_op_converters() { } // namespace -tensorflow::Status ConvertCalibrationNodeToEngineNode( - tensorflow::Graph& graph, tensorflow::Node* c_node) { - const auto ndef = c_node->def(); - - TFAttrs attrs(ndef); - std::vector segment_nodes( - attrs.get>("segment_nodes")); - std::vector output_nodes( - attrs.get>("segment_output_names")); - std::vector input_names( - attrs.get>("input_names")); - string res_name = attrs.get("resource_name"); - VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name; - string engine_name = "my_trt_op"; - { - const auto node_id = tensorflow::str_util::Split(res_name, "_"); - engine_name += node_id.back(); - } - std::map node_maps; - - for (auto n : graph.op_nodes()) { - node_maps.insert({n->name(), n}); - } - std::set subgraph_ids; - for (const auto internal_node : segment_nodes) { - subgraph_ids.insert(node_maps.at(internal_node)->id()); - } - if (VLOG_IS_ON(2)) { - string node_names = StrCat(c_node->name(), " segment nodes= "); - - for (const auto& node_name : segment_nodes) { - StrAppend(&node_names, node_name, ", "); - } - VLOG(2) << node_names; +tensorflow::Status ConvertGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, + size_t max_workspace_size_bytes, + const std::vector& input_shapes, + Logger* logger, nvinfer1::IGpuAllocator* allocator, + 
TRTInt8Calibrator* calibrator, + TrtUniquePtrType* engine, + bool* convert_successfully) { + engine->reset(); + if (convert_successfully) *convert_successfully = false; + + // Create the builder. + TrtUniquePtrType builder( + nvinfer1::createInferBuilder(*logger)); + builder->setMaxBatchSize(max_batch_size); + // TODO(aaroey): use the allocator to allocate the TRT workspace. + builder->setMaxWorkspaceSize(max_workspace_size_bytes); +#if NV_TENSORRT_MAJOR > 3 + builder->setGpuAllocator(allocator); +#endif + if (precision_mode == FP16MODE) { + builder->setHalf2Mode(true); + } else if (precision_mode == INT8MODE) { + builder->setInt8Mode(true); + builder->setInt8Calibrator(calibrator); } - VLOG(1) << "Output Nodes:"; - std::vector out_types; - std::vector out_edges; + // Create the network. + auto trt_network = + TrtUniquePtrType(builder->createNetwork()); + if (!trt_network) { + return tensorflow::errors::Internal( + "Failed to create TensorRT network object"); + } + auto ws = std::unique_ptr(new TRTWeightStore()); - for (auto& i : output_nodes) { - auto node_port = tensorflow::str_util::Split(i, ":"); - VLOG(1) << " " << i << " in graph " << node_maps.count(i); - auto out_node_name = node_port.at(0); - if (node_port.size() > 1) { - VLOG(1) << "Multi port output" << node_port.at(0) << " " - << node_port.at(1) << " size=" << node_port.size(); - } - auto node_it = node_maps.find(out_node_name); - if (node_it != node_maps.end()) { - tensorflow::Node* out_node = node_it->second; - int port = 0; - if (node_port.size() == 2) { - port = std::strtoul(node_port.at(1).c_str(), nullptr, 10); - out_types.push_back(out_node->output_type(port)); - } else { - out_types.push_back(out_node->output_type(0)); + // Build the network + VLOG(1) << "Starting engine conversion "; + Converter converter(trt_network.get(), ws.get(), precision_mode == FP16MODE); + std::vector> output_tensors; + // Graph nodes are already topologically sorted during construction + for (const auto& node_def : gdef.node()) { + string node_name = node_def.name(); + VLOG(1) << "Converting op name=" << node_name << ", op=" << node_def.op(); + if (tensorflow::str_util::StartsWith(node_name, kInputPHName) && + (node_def.op() == "Placeholder")) { + nvinfer1::DimsCHW input_dim_pseudo_chw; + for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0; + nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); + auto type_status = + ConvertDType(node_def.attr().at("dtype").type(), &dtype); + if (type_status != tensorflow::Status::OK()) { + LOG(WARNING) << "Type conversion failed for " << node_name; + return type_status; } - for (auto out_edge : out_node->out_edges()) { - if (subgraph_ids.count(out_edge->dst()->id())) - continue; // skip internal edges; - if (out_edge->src_output() == port) { - out_edges.push_back(out_edge); - VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" - << out_edge->src_output() << " -> " << out_edge->dst()->name() - << ":" << out_edge->dst_input(); + int32 slot_number = -1; + if (!tensorflow::strings::safe_strto32(node_name.c_str() + 8, + &slot_number)) { + LOG(ERROR) << "Failed to parse slot number from " << node_name + << " +8= " << node_name.c_str() + 8; + } + auto shape = input_shapes.at(slot_number); + if (shape.dims() > 8) { + LOG(ERROR) << "Tensor rank is greater than 8 for " << node_name + << " at input slot " << slot_number; + return tensorflow::errors::OutOfRange( + "Input tensor rank is greater than 8"); + } + if (VLOG_IS_ON(1)) { + string dim_str("dims="); + StrAppend(&dim_str, "[ ", shape.dim_size(0)); + 
for (int i = 1; i < shape.dims(); i++) { + StrAppend(&dim_str, ", ", shape.dim_size(i)); } + StrAppend(&dim_str, " ]"); + VLOG(1) << dim_str; + } + for (int i = 1; i < shape.dims(); i++) { + input_dim_pseudo_chw.d[i - 1] = shape.dim_size(i); } - } else { - LOG(WARNING) << " couldn't find output node " << out_node_name; - } - } - if (VLOG_IS_ON(1)) { - VLOG(1) << c_node->name() << " Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); - } - } - auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); - auto resmgr = trt_rm->getManager("TRTCalibOps"); - tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr; - auto status = resmgr->Lookup(res_name, res_name, &calib_res); - if (!status.ok() || !calib_res->calibrator_) { - return tensorflow::errors::FailedPrecondition( - "You must run calibration" - " and inference conversion in the same process"); - } - - calib_res->calibrator_->setDone(); - calib_res->thr_->join(); - delete calib_res->thr_; - if (!calib_res->engine_) { - LOG(ERROR) << "Calibration failed!, engine does not exist. Did you run " - "calibration graph?"; - return tensorflow::errors::FailedPrecondition( - "Calibration graph needs to be executed on" - " calibration data before convertsion to inference graph"); - } - auto weight_rmgr = trt_rm->getManager("WeightStore"); - TF_CHECK_OK(weight_rmgr->Delete( - res_name, res_name)); - auto engine_plan = calib_res->engine_->serialize(); - calib_res->engine_->destroy(); - calib_res->network_->destroy(); - calib_res->builder_->destroy(); - calib_res->thr_ = nullptr; - calib_res->engine_ = nullptr; - calib_res->builder_ = nullptr; - tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); - std::vector income_edges; - income_edges.resize(c_node->num_inputs()); - for (const auto in_edge : c_node->in_edges()) { - auto src = in_edge->src(); - int dest_port = in_edge->dst_input(); - VLOG(1) << "Incoming connection " << src->name() << ":" - << in_edge->src_output() << " -> " << c_node->name() << ":" - << dest_port; - income_edges.at(dest_port) = {src->name(), in_edge->src_output(), - c_node->input_type(dest_port)}; - } - tensorflow::gtl::ArraySlice input_list( - income_edges); - if (VLOG_IS_ON(2)) { - for (const auto& inp : input_list) { - VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " - << tensorflow::DataTypeString(inp.data_type); - } - } - op_builder.Input(input_list); - tensorflow::NodeDef engine_node; - const char* engine_plan_data = static_cast(engine_plan->data()); - string engine_plan_string(engine_plan_data, - engine_plan_data + engine_plan->size()); - status = op_builder.Attr("serialized_engine", engine_plan_string) - .Attr("input_nodes", input_names) - .Attr("output_nodes", output_nodes) - .Attr("OutT", out_types) - .Finalize(&engine_node); - if (!status.ok()) { - LOG(ERROR) << "Engine Node creation failed"; - return status; - } - auto trt_engine_node = graph.AddNode(engine_node, &status); - TF_RETURN_IF_ERROR(status); - std::map port_map; - for (size_t t = 0; t < output_nodes.size(); t++) { - port_map.insert({output_nodes.at(t), t}); - } - for (auto& i : out_edges) { - string s(i->src()->name()); - if (i->src_output()) StrAppend(&s, ":", i->src_output()); - int out_port = port_map.at(s); - VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port - << " -> " << i->dst()->name() << ":" << i->dst_input(); - TF_RETURN_IF_ERROR( - graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); - } - 
for (const auto ed : trt_engine_node->in_edges()) { - VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); - } - for (const auto ed : trt_engine_node->out_edges()) { - VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); - } - VLOG(1) << "Segment nodes:"; - for (auto& i : segment_nodes) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); - auto it = node_maps.find(i); - if (it != node_maps.end()) { - graph.RemoveNode(it->second); - } - } - graph.RemoveNode(c_node); - return tensorflow::Status::OK(); -} -tensorflow::Status ReverseTopologicalSort( - const tensorrt::convert::SubGraphParams& s, - std::list* order) { - std::vector order_vec; - tensorflow::GetPostOrder(s.graph, &order_vec); - // Select just the subgraph - for (tensorflow::Node* node : order_vec) { - if (s.subgraph_node_ids.count(node->id())) { - // We want topological order to contstruct the - // network layer by layer - order->push_front(node); + input_dim_pseudo_chw.nbDims = shape.dims() - 1; + nvinfer1::ITensor* input_tensor = converter.network()->addInput( + node_name.c_str(), dtype, input_dim_pseudo_chw); + if (!input_tensor) { + return tensorflow::errors::InvalidArgument( + "Failed to create Input layer tensor ", node_name, + " rank=", shape.dims() - 1); + } + VLOG(1) << "Input tensor name :" << node_name; + if (!converter.insert_input_tensor(node_name, input_tensor)) { + return tensorflow::errors::AlreadyExists( + "Output tensor already exists for op: " + node_name); + } + } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) && + (node_def.op() == "Identity")) { + int32 slot_number = -1; + if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9, + &slot_number)) { + LOG(ERROR) << "Failed to parse slot number from " << node_name + << " +9=" << node_name.c_str() + 9; + } + if (output_tensors.size() <= slot_number) { + output_tensors.resize(slot_number + 1); + } + output_tensors.at(slot_number) = {node_def.input(0), node_name}; + } else { + VLOG(2) << "Converting node: " << node_def.name() << " , " + << node_def.op(); + TF_RETURN_IF_ERROR(converter.convert_node(node_def)); } } - return tensorflow::Status::OK(); -} - -tensorflow::Status SetInputList( - const tensorrt::convert::SubGraphParams& s, - tensorflow::NodeDefBuilder* op_builder, - const std::vector* input_names, - std::vector* input_dtypes) { - std::vector income_edges; - VLOG(2) << "input edge size: " << input_names->size(); - for (size_t i = 0; i < input_names->size(); ++i) { - VLOG(2) << "input edges: " << i << " " << input_names->at(i); - int output_idx = s.input_inds.at(i).second; - // we wired up the input here already, it is redundant to do it again in - // ConvertSubGraphToTensorRT(convert_graph.cc) - auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut( - input_names->at(i), output_idx, input_dtypes->at(i)); - income_edges.push_back(incoming_edge); - } - tensorflow::gtl::ArraySlice input_list( - income_edges); - op_builder->Input(input_list); - return tensorflow::Status::OK(); -} - -string SubgraphNameScopeGenerator(const std::list* order) { - string subgraph_name_scope; - if (!order->empty()) { - subgraph_name_scope = order->front()->name(); - } - for (const tensorflow::Node* node : *order) { - subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name()); - } - // TODO(sami,ben,jie): proper naming! 
- return subgraph_name_scope; -} - -tensorflow::Status ConvertSubgraph( - Converter& converter, tensorrt::convert::SubGraphParams& s, - std::list* order, std::vector* input_names, - std::vector* input_dtypes, - std::vector* output_names, - std::vector* output_dtypes, - const string& engine_name) { - std::set added_tensors; - for (const std::pair& input : s.input_inds) { - VLOG(2) << "parsing input. Node id= " << input.first; - int node_id = input.first; - int output_idx = input.second; - tensorflow::Node* node = s.graph.FindNodeId(node_id); - auto node_name = node->name(); - // input_names should use the node name in the graph - // here it should be the input tensor name -> matching the binding - // insert original node name without port - auto tensor_name = node_name; - if (output_idx != 0) { - tensor_name = StrCat(tensor_name, ":", output_idx); - } - - VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name - << " idx: " << output_idx; - - auto shape_inference_node_name = node_name; - auto shape_inference_output_idx = output_idx; - // rewire the shape inference to original node in the graph - if (s.output_edge_map->count(tensor_name)) { - shape_inference_node_name = s.output_edge_map->at(tensor_name).second; - shape_inference_output_idx = s.output_edge_map->at(tensor_name).first; - } - if (shape_inference_output_idx < 0) continue; - VLOG(2) << "shapeinference name: " << shape_inference_node_name - << " idx: " << shape_inference_output_idx; - - if (!s.graph_properties.HasOutputProperties(shape_inference_node_name)) - return tensorflow::errors::Internal("failed to find input node: " + - shape_inference_node_name); - - auto op_info_vec = - s.graph_properties.GetOutputProperties(shape_inference_node_name); - if (static_cast(op_info_vec.size()) <= shape_inference_output_idx) - return tensorflow::errors::Internal( - "accessing output index of: ", shape_inference_output_idx, - ", at node: ", shape_inference_node_name, - " with output entry from shape_map: ", op_info_vec.size()); - - auto op_info = op_info_vec.at(shape_inference_output_idx); - tensorflow::DataType tf_dtype = op_info.dtype(); - - nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); - auto type_status = ConvertDType(tf_dtype, &dtype); - if (type_status != tensorflow::Status::OK()) { - LOG(WARNING) << "Type conversion failed for " << node_name; - return type_status; - } - - VLOG(2) << "Accessing output index of: " << output_idx - << ", at node: " << node_name - << " with output entry from shape_map: " << op_info_vec.size(); - // TODO(ben,jie): update TRT input format/dimension - nvinfer1::DimsCHW input_dim_pseudo_chw; - for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1; - - // TODO(jie): TRT 3.x only support 4 dimensional input tensor. - // update the code once TRT 4.0 comes out. - if (op_info.shape().dim_size() != 4) { - string err_str = "Require 4 dimensional input."; - StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ", - shape_inference_node_name); - return tensorflow::errors::Unimplemented(err_str); - } - - for (int i = 1; i < op_info.shape().dim_size(); i++) { - VLOG(2) << "dimension: " << i - << " , size: " << op_info.shape().dim(i).size(); - input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size(); - } - - // TODO(ben,jie): proper way to restore input tensor name? 
- auto input_tensor_name = node_name; - if (output_idx != 0) { - input_tensor_name = StrCat(node_name, ":", output_idx); - } - if (added_tensors.count(input_tensor_name)) continue; - added_tensors.insert(input_tensor_name); - input_names->push_back(input_tensor_name); - input_dtypes->push_back(tf_dtype); - nvinfer1::ITensor* input_tensor = converter.network()->addInput( - input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); - - if (!input_tensor) - return tensorflow::errors::InvalidArgument( - "Failed to create Input layer"); - VLOG(2) << "Input tensor name :" << input_tensor_name; - - if (!converter.insert_input_tensor(input_tensor_name, input_tensor)) - return tensorflow::errors::AlreadyExists( - "Output tensor already exists for op: " + input_tensor_name); - } - - for (const tensorflow::Node* node : *order) { - const tensorflow::NodeDef& node_def = node->def(); - VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op(); - TF_RETURN_IF_ERROR(converter.convert_node(node_def)); - } - - VLOG(2) << "Finished conversion"; - - // Gather output metadata - int trt_engine_op_output_idx = 0; - added_tensors.clear(); - for (const std::pair& output : s.output_inds) { - int node_id = output.first; - int output_idx = output.second; - tensorflow::Node* node = s.graph.FindNodeId(node_id); - string op_name = node->name(); - string tensor_name = op_name; - - s.output_edge_map->insert( - {trt_engine_op_output_idx == 0 - ? engine_name - : StrCat(engine_name, ":", trt_engine_op_output_idx), - {output_idx, tensor_name}}); - trt_engine_op_output_idx++; - if (output_idx != 0) - tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); - VLOG(2) << "Output tensor name: " << tensor_name; - if (added_tensors.count(tensor_name)) continue; - added_tensors.insert(tensor_name); - output_names->push_back(tensor_name); - auto tensor_or_weights = converter.get_tensor(tensor_name); + for (const auto& output : output_tensors) { + auto tensor_or_weights = converter.get_tensor(output.first); if (!tensor_or_weights.is_tensor()) { - return tensorflow::errors::InvalidArgument("Output node '" + tensor_name + - "' is weights not tensor"); + return tensorflow::errors::InvalidArgument( + "Output node '" + output.first + "' is weights not tensor"); } nvinfer1::ITensor* tensor = tensor_or_weights.tensor(); + tensor->setName(output.second.c_str()); if (!tensor) { return tensorflow::errors::NotFound("Output tensor not found: " + - tensor_name); + output.first); } + VLOG(1) << "Marking output tensor " << output.first << ", as output tensor " + << output.second; + converter.network()->markOutput(*tensor); - tensorflow::DataType tf_dtype = node->output_type(output_idx); - output_dtypes->push_back(tf_dtype); - nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; - TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype)); - tensor->setType(trt_dtype); } + if (convert_successfully) *convert_successfully = true; - return tensorflow::Status::OK(); -} - -tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { - // Visit nodes in reverse topological order and construct the TRT network. - // Toposort - std::list order; - TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order)); - - static int static_id = 0; - string subgraph_name_scope = SubgraphNameScopeGenerator(&order); - // TODO(sami,ben,jie): proper naming! 
- string calib_op_name = - StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id); - string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id); - static_id++; - - auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance(); - auto op_rmgr = trt_rmgr->getManager("TRTCalibOps"); - auto op_res = new tensorflow::tensorrt::TRTCalibrationResource(); - TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res)); - op_res->logger_ = new tensorflow::tensorrt::Logger(); - cudaSetDevice(s.cuda_gpu_id_); - op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_)); - op_res->allocator_ = s.allocator_; -#if NV_TENSORRT_MAJOR > 3 - op_res->builder_->setGpuAllocator(s.allocator_.get()); -#endif - if (!op_res->builder_) { - return tensorflow::errors::Internal( - "failed to create TensorRT builder object"); + // Build the engine. + VLOG(1) << "Starting engine creation"; + engine->reset(builder->buildCudaEngine(*converter.network())); + if (engine->get() == nullptr) { + return tensorflow::errors::Internal("Failed to build TensorRT engine"); } - - op_res->network_ = op_res->builder_->createNetwork(); - if (!op_res->network_) { - return tensorflow::errors::Internal( - "failed to create TensorRT network object"); - } - - // Build the network - auto weight_rmgr = trt_rmgr->getManager("WeightStore"); - auto ws = new tensorflow::tensorrt::TRTWeightStore(); - TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws)); - Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE); - - std::vector input_names; - std::vector input_dtypes; - std::vector output_names; - std::vector output_dtypes; - TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names, - &input_dtypes, &output_names, - &output_dtypes, engine_name)); - - VLOG(2) << "Finished processing outputs"; - - // Build the engine - op_res->builder_->setMaxBatchSize(s.max_batch_size); - op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes); - VLOG(0) << "Max batch size= " << s.max_batch_size - << " max workspace size= " << s.max_workspace_size_bytes; - - // Build the TRT op - // TODO(sami,ben,jie): proper naming! - tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp"); - TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes)); - - std::vector segment_names; - segment_names.reserve(s.subgraph_node_ids.size()); - for (int i : s.subgraph_node_ids) { - auto node = s.graph.FindNodeId(i); - segment_names.push_back(node->name()); - } - LOG(INFO) << "finished op preparation"; - - auto status = op_builder.Attr("segment_nodes", segment_names) - .Attr("input_names", input_names) - .Attr("segment_output_names", output_names) - .Attr("resource_name", calib_op_name) - .Finalize(s.trt_node); - - LOG(INFO) << status.ToString(); - LOG(INFO) << "finished op building"; - + VLOG(1) << "Finished conversion"; return tensorflow::Status::OK(); } -tensorflow::Status ConvertSubGraphToTensorRTNodeDef( - tensorrt::convert::SubGraphParams& s) { - // Visit nodes in reverse topological order and construct the TRT network. 
- std::list order; - TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order)); - - static int static_id = 0; - string subgraph_name_scope = SubgraphNameScopeGenerator(&order); - string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++); - - tensorflow::tensorrt::Logger trt_logger; - cudaSetDevice(s.cuda_gpu_id_); - auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger)); - if (!trt_builder) { - return tensorflow::errors::Internal( - "Failed to create TensorRT builder object"); - } -#if NV_TENSORRT_MAJOR > 3 - trt_builder->setGpuAllocator(s.allocator_.get()); -#endif - auto trt_network = infer_object(trt_builder->createNetwork()); - if (!trt_network) { - return tensorflow::errors::Internal( - "Failed to create TensorRT network object"); - } - - auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance(); - auto weight_rmgr = trt_rmgr->getManager("WeightStore"); - auto ws = new tensorflow::tensorrt::TRTWeightStore(); - TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws)); - - // Build the network - Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE); - - std::vector input_names; - std::vector input_dtypes; - std::vector output_names; - std::vector output_dtypes; - TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names, - &input_dtypes, &output_names, - &output_dtypes, engine_name)); - - VLOG(2) << "Finished output"; - - // Build the engine - trt_builder->setMaxBatchSize(s.max_batch_size); - trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes); - VLOG(0) << "Max batch size= " << s.max_batch_size - << " max workspace size= " << s.max_workspace_size_bytes; - if (s.precision_mode == FP16MODE) { - trt_builder->setHalf2Mode(true); - VLOG(0) << "Using FP16 precision mode"; - } - LOG(INFO) << "starting build engine"; - string engine_plan_string; - { - auto trt_engine = - infer_object(trt_builder->buildCudaEngine(*converter.network())); - VLOG(0) << "Built network"; - if (trt_engine.get() == nullptr) { - return tensorflow::errors::Internal("Engine building failure"); +tensorflow::Status ConvertSegmentToGraphDef( + const tensorflow::Graph* graph, + const tensorflow::grappler::GraphProperties& graph_properties, + const std::vector& subgraph_node_ids, // In topological order + std::vector* connections, + tensorflow::GraphDef* segment_def, string* common_scope) { + std::set marker_nodes; + // Update connection shapes/data types and add corresponding input/output + // nodes in the segment graphdef. + for (size_t i = 0; i < connections->size(); ++i) { + auto& connection = connections->at(i); + auto outside_node = graph->FindNodeId(connection.outside_id); + if (!outside_node) { + // This should never happen, unless the original graph is problematic. + return tensorflow::errors::NotFound( + "Cannot find node with id ", connection.outside_id, " in the graph."); + } + // Updates the shape and data types of input/output connections. 
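+    // For an input edge the dtype/shape come from the outside node's output
+    // properties; for an output edge they come from the outside node's input
+    // properties. If grappler has no shape information for that node, only
+    // the data type is recorded (taken from output_type()).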
+ tensorflow::DataType input_type = tensorflow::DT_FLOAT; + tensorflow::PartialTensorShape partial_shape; + if (connection.is_input_edge) { + if (graph_properties.HasOutputProperties(connection.outside_node_name)) { + auto output_params = + graph_properties.GetOutputProperties(connection.outside_node_name); + auto out_shape = output_params.at(connection.outside_port); + input_type = out_shape.dtype(); + std::vector dims; + partial_shape = out_shape.shape(); + connection.outside_shape = partial_shape; + } else { + VLOG(0) << "Unknown output shape" << outside_node->name(); + input_type = graph->FindNodeId(connection.outside_id) + ->output_type(connection.outside_port); + } + connection.connection_type = input_type; + + } else { // output edge + if (graph_properties.HasInputProperties(connection.outside_node_name)) { + auto input_params = + graph_properties.GetInputProperties(connection.outside_node_name); + auto in_shape = input_params.at(connection.outside_port); + input_type = in_shape.dtype(); + partial_shape = in_shape.shape(); + connection.inside_shape = partial_shape; + } else { + input_type = graph->FindNodeId(connection.inside_id) + ->output_type(connection.outside_port); + } + connection.connection_type = input_type; } - auto engine_plan = infer_object(trt_engine->serialize()); - VLOG(0) << "Serialized engine"; - const char* engine_plan_data = - static_cast(engine_plan->data()); - engine_plan_string = - string(engine_plan_data, engine_plan_data + engine_plan->size()); - } - TF_RETURN_IF_ERROR(weight_rmgr->Delete( - engine_name, engine_name)); - LOG(INFO) << "finished engine " << engine_name << " containing " - << s.subgraph_node_ids.size() << " nodes"; - - // Build the TRT op - tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); - TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes)); - - VLOG(0) << "Finished op preparation"; - - auto status = op_builder.Attr("serialized_engine", engine_plan_string) - .Attr("input_nodes", input_names) - .Attr("output_nodes", output_names) - .Attr("OutT", output_dtypes) - .Device(s.device_name_) - .Finalize(s.trt_node); - - VLOG(0) << status.ToString() << " finished op building for " << engine_name - << " on device " << s.device_name_; + // Add dummy input/output nodes to the segment graphdef. 
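+    // Each boundary edge gets a marker node in the segment graphdef: input
+    // edges a Placeholder named InputPH_<port_number>, output edges an
+    // Identity named OutputPH_<port_number>. A marker is created only once
+    // per port and reused for additional edges on the same port.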
+ if (connection.is_input_edge) { + const string node_name = StrCat(kInputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing input " << node_name << " for the edge " + << connection.outside_node_name << ":" + << connection.outside_port << " -> " + << connection.inside_node_name << ":" << connection.inside_port; + continue; + } + marker_nodes.insert(node_name); + auto seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Placeholder"); + auto status = builder.Attr("shape", partial_shape) + .Attr("dtype", input_type) + .Finalize(seg_node); + VLOG(1) << "Constructing input " << node_name << " for the edge " + << connection.outside_node_name << ":" << connection.outside_port + << " -> " << connection.inside_node_name << ":" + << connection.inside_port; + } else { + const string node_name = StrCat(kOutputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing output " << node_name << " for the edge " + << connection.inside_node_name << ":" << connection.inside_port + << " -> " << connection.outside_node_name << ":" + << connection.outside_port; + continue; + } + marker_nodes.insert(node_name); + auto seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Identity"); + auto status = builder.Input(connection.inside_node_name, 0, input_type) + .Finalize(seg_node); + VLOG(1) << "Constructing output " << node_name << " for the edge " + << connection.inside_node_name << ":" << connection.inside_port + << " -> " << connection.outside_node_name << ":" + << connection.outside_port; + } + } // for each connection. + + std::unordered_map old_to_new_id_map; + // Copy internal nodes to new graphdef + string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name(); + for (const auto node_id : subgraph_node_ids) { + const auto node = graph->FindNodeId(node_id); + local_scope = GetCommonNameScope(local_scope, node->name()); + old_to_new_id_map[node_id] = segment_def->node_size(); + auto snode = segment_def->add_node(); + snode->CopyFrom(node->def()); + VLOG(1) << "Copying " << snode->name() << " to subgraph"; + } + // Update the inputs of the new input nodes to point to placeholder nodes. + for (int i = 0; i < connections->size(); ++i) { + auto& connection = connections->at(i); + if (!connection.is_input_edge) continue; + auto snode = + segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); + const string placeholder_name = + StrCat(kInputPHName, connection.port_number); + VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port + << " from " << snode->input(connection.inside_port) << " to " + << placeholder_name; + snode->set_input(connection.inside_port, placeholder_name); + } + *common_scope = local_scope; + VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 3f6592cd25..1a4c0e755d 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -22,69 +22,112 @@ limitations under the License. 
#include #include +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" +#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" + #if GOOGLE_CUDA #if GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { +static const char* kInputPHName = "InputPH_"; +static const char* kOutputPHName = "OutputPH_"; namespace convert { +// TODO(aaroey): use an enum instead. const int FP32MODE = 0; const int FP16MODE = 1; const int INT8MODE = 2; -struct SubGraphParams { - SubGraphParams( - tensorflow::Graph& inp_graph, - const std::set& subgraph_node_id_numbers, - const std::vector>& input_indices, - const std::vector>& output_indices, - size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& current_graph_properties, - std::unordered_map>* output_edges, - tensorflow::NodeDef* constructed_trt_node, - int engine_precision_mode = FP32MODE, const string& device_name = "", - std::shared_ptr allocator = nullptr, - int cuda_gpu_id = 0) - : graph(inp_graph), - subgraph_node_ids(subgraph_node_id_numbers), - input_inds(input_indices), - output_inds(output_indices), - max_batch_size(max_supported_batch_size), - max_workspace_size_bytes(max_consumed_workspace_size_bytes), - graph_properties(current_graph_properties), - output_edge_map(output_edges), - trt_node(constructed_trt_node), - precision_mode(engine_precision_mode), - device_name_(device_name), - allocator_(allocator), - cuda_gpu_id_(cuda_gpu_id) {} - - tensorflow::Graph& graph; - const std::set& subgraph_node_ids; - const std::vector>& input_inds; // {node_id, output_idx} - const std::vector>& output_inds; // {node_id, output_idx} - size_t max_batch_size; - size_t max_workspace_size_bytes; - const tensorflow::grappler::GraphProperties& graph_properties; - std::unordered_map>* output_edge_map; - tensorflow::NodeDef* trt_node; - const int precision_mode; - const string device_name_; - std::shared_ptr allocator_; - const int cuda_gpu_id_; +struct EngineConnection { + EngineConnection(const string& outside, int out_id, int out_port, + const string& inside, int in_id, int in_port, + bool input_edge, int port) + : outside_node_name(outside), + outside_id(out_id), + outside_port(out_port), + inside_node_name(inside), + inside_id(in_id), + inside_port(in_port), + is_input_edge(input_edge), + port_number(port) {} + + const string outside_node_name; + const int outside_id; + const int outside_port; + tensorflow::PartialTensorShape outside_shape; + + const string inside_node_name; + const int inside_id; + const int inside_port; + tensorflow::PartialTensorShape inside_shape; + + tensorflow::DataType connection_type; + bool is_input_edge; + + // The port number of the TRT node connecting to this edge. + int port_number; +}; + +struct EngineInfo { + EngineInfo() + : engine_type(EngineType::TRTStatic), + max_workspace_size_bytes(0), + precision_mode(FP32MODE) {} + + string engine_name; + string device; + tensorflow::GraphDef segment_graph_def; + + // The segment nodes that are on one side of the edges are topological sorted. 
+  std::vector<EngineConnection> connections;
+
+  enum class EngineType { TRTStatic = 0, TRTDynamic = 1 };
+  EngineType engine_type;
+  int64 max_workspace_size_bytes;
+  int maximum_cached_engines;
+  std::vector<int> cached_engine_batches;
+  int precision_mode;
 };
-// TODO(sami): Replace references with const reference or pointers
-tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams& params);
-tensorflow::Status InjectCalibrationNode(SubGraphParams& params);
-tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph,
-                                                      tensorflow::Node* c_node);
+// Constructs a graphdef from the segment in the given graph. Adds placeholder
+// nodes for input edges (InputPH_*) and identity nodes for output edges
+// (OutputPH_*). This function needs to be called before the TensorRT nodes
+// are inserted, in order to correctly get sizes from the original graph.
+//
+// - subgraph_node_ids: the node ids of the subgraph, must be sorted in
+//   topological order.
+// - segment_def: the output GraphDef, whose non-input/output nodedefs will be
+//   sorted in topological order.
+tensorflow::Status ConvertSegmentToGraphDef(
+    const tensorflow::Graph* graph,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    const std::vector<int>& subgraph_node_ids,
+    std::vector<EngineConnection>* connections,
+    tensorflow::GraphDef* segment_def, string* common_scope);
+
+// Converts the given subgraph to a TRT engine saved in 'engine'. Returns ok
+// iff 'builder' successfully builds the engine. If the result is not ok,
+// 'engine' will be set to nullptr. Once returned, 'builder' is not needed any
+// more and can be safely destroyed.
+//
+// - convert_successfully: indicates whether the conversion to the TensorRT
+//   network is successful. This is different from successfully building the
+//   engine: building can still fail afterwards.
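A rough sketch of how the segment-level entry points in this header chain together — ConvertSegmentToGraphDef above and ConvertGraphDefToEngine declared just below. All variable names are hypothetical stand-ins for what convert_graph.cc actually provides; a real caller also wires the resulting connections into the created TRTEngineOp:

// Sketch only: graph, graph_properties, subgraph_node_ids, connections,
// input_shapes and logger are assumed to exist in the caller.
tensorflow::GraphDef segment_def;
string common_scope;
TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
    &graph, graph_properties, subgraph_node_ids, &connections, &segment_def,
    &common_scope));

TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
bool convert_successfully = false;
TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
    segment_def, FP16MODE, /*max_batch_size=*/8,
    /*max_workspace_size_bytes=*/1 << 30, input_shapes, &logger,
    /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine,
    &convert_successfully));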
+tensorflow::Status ConvertGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, + size_t max_workspace_size_bytes, + const std::vector& input_shapes, + Logger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtUniquePtrType* engine, + bool* convert_successfully); + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc index 8f634b1f74..ec9dbfa13b 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc @@ -45,8 +45,24 @@ tensorflow::Status TRTOptimizationPass::Init( if (params.count("max_batch_size")) { maximum_batch_size_ = params.at("max_batch_size").i(); } - if (params.count("max_workspace_size_bytes")) + is_dynamic_op_ = false; + if (params.count("is_dynamic_op")) { + is_dynamic_op_ = params.at("is_dynamic_op").b(); + } + if (params.count("cached_engine_batches")) { + auto batch_vec = params.at("cached_engine_batches").list(); + batches_.reserve(batch_vec.i_size()); + for (const auto i : batch_vec.i()) { + batches_.push_back(i); + } + } + max_cached_batches_ = 1; + if (params.count("maximum_cached_engines")) { + max_cached_batches_ = params.at("maximum_cached_engines").i(); + } + if (params.count("max_workspace_size_bytes")) { maximum_workspace_size_ = params.at("max_workspace_size_bytes").i(); + } if (params.count("precision_mode")) { string pm = Uppercase(params.at("precision_mode").s()); if (pm == "FP32") { @@ -175,6 +191,17 @@ tensorflow::Status TRTOptimizationPass::Optimize( if (VLOG_IS_ON(1)) { PrintDebugInfo(cluster, item); } + // This is a hack to workaround optimizer issue. MetaOptimizer calls + // optimization passes on function objects as well, we should not modify + // generated funcdefs! This is fragile but we don't have any other option + // until framework fixes it. + if (item.id != "tf_graph") { + LOG(WARNING) << name_ + << " is probably called on funcdef! 
This optimizer must *NOT* " + "be called on function objects."; + *optimized_graph = item.graph; + return tensorflow::Status::OK(); + } int max_dim = -1; if (item.feed.size()) { for (const auto& f : item.feed) { @@ -204,11 +231,22 @@ tensorflow::Status TRTOptimizationPass::Optimize( } tensorflow::grappler::GraphProperties static_graph_properties(item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - auto status = tensorflow::tensorrt::convert::ConvertAfterShapes( - item.graph, item.fetch, maximum_batch_size_, maximum_workspace_size_, - optimized_graph, precision_mode_, minimum_segment_size_, - static_graph_properties, cluster); + tensorflow::tensorrt::convert::ConversionParams cp; + cp.input_graph_def = &item.graph; + cp.output_names = &item.fetch; + cp.max_batch_size = maximum_batch_size_; + cp.max_workspace_size_bytes = maximum_workspace_size_; + cp.output_graph_def = optimized_graph; + cp.precision_mode = precision_mode_; + cp.minimum_segment_size = minimum_segment_size_; + cp.graph_properties = &static_graph_properties; + cp.cluster = cluster; + cp.is_dyn_op = is_dynamic_op_; + cp.cached_engine_batches = batches_; + cp.max_cached_engines = max_cached_batches_; + auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp); VLOG(2) << optimized_graph->DebugString(); + VLOG(1) << "Returning from " << name_; return status; } diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h index d8ecead23e..463ed3883e 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h @@ -61,6 +61,9 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { int minimum_segment_size_; int precision_mode_; int maximum_batch_size_; + bool is_dynamic_op_; + std::vector batches_; + int max_cached_batches_; int64_t maximum_workspace_size_; }; diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/contrib/tensorrt/convert/utils.h new file mode 100644 index 0000000000..f601c06701 --- /dev/null +++ b/tensorflow/contrib/tensorrt/convert/utils.h @@ -0,0 +1,37 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ +#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ + +#include + +namespace tensorflow { +namespace tensorrt { + +template +struct TrtDestroyer { + void operator()(T* t) { + if (t) t->destroy(); + } +}; + +template +using TrtUniquePtrType = std::unique_ptr>; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 9ac8047944..8a17eb02f1 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -14,8 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h" +#include +#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resources.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" @@ -25,144 +33,556 @@ limitations under the License. #include "cuda/include/cuda_runtime_api.h" namespace tensorflow { -static ::tensorflow::tensorrt::Logger logger; -using IRuntime = nvinfer1::IRuntime; -using Dims = nvinfer1::Dims; - namespace tensorrt { +static Logger logger; +using ::nvinfer1::IRuntime; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +// A helper class to call done() when destructed for asynchronous execution. +// Helps simultaneous execution of native and TRT engines. 
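The ref-counting pattern that the class defined just below relies on, sketched with hypothetical names ('done' is the kernel's DoneCallback); it mirrors how ComputeAsync and ExecuteNativeSegment use the helper further down in this file:

// Sketch only: done() runs when the last reference is dropped, so every
// asynchronous branch holds its own reference while it is in flight.
auto* helper = new AsyncHelper(done);
tensorflow::core::ScopedUnref sc(helper);  // releases the initial reference
helper->Ref();  // taken by an async branch, e.g. the native segment
// ... and inside that branch's completion callback:
//   tensorflow::core::ScopedUnref branch_ref(helper);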
+class AsyncHelper : public tensorflow::core::RefCounted { + public: + AsyncHelper(tensorflow::AsyncOpKernel::DoneCallback done) { done_ = done; } + ~AsyncHelper() override { done_(); } -TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) { + private: + tensorflow::AsyncOpKernel::DoneCallback done_; +}; + +#define TYPECASE(dt, X, Y) \ + case dt: { \ + return (void*)X->flat::Type>().data(); \ + } + +void* GetTensorAddress(const Tensor* tensor_ptr) { + auto tensor_type = tensor_ptr->dtype(); + switch (tensor_type) { + TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr); + TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr); + TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr); + default: { + LOG(ERROR) << "Unsupported Data type " + << tensorflow::DataTypeString(tensor_type); + return nullptr; + } + } +} + +tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { + VLOG(1) << "Constructing function handle"; + auto lib = ctx->function_library(); + if (lib == nullptr) { + return tensorflow::errors::Internal("Context function library is null"); + } + auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); + if (fdef == nullptr) { + return tensorflow::errors::Internal("Native FunctionDef ", funcdef_name_, + " can't be found in function library"); + } + tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.overlay_lib = nullptr; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + native_func_ = 0; + auto status = lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), + inst_ops, &native_func_); + if (!status.ok()) { + LOG(ERROR) << " Instantiating native function " << funcdef_name_ + << " failed!"; + } + return status; +} + +TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) + : AsyncOpKernel(context) { // read serialized_engine OP_REQUIRES_OK(context, - context->GetAttr("serialized_engine", &serialized_engine_)); + context->GetAttr("serialized_segment", &serialized_segment_)); + OP_REQUIRES_OK(context, + context->GetAttr("workspace_size_bytes", &workspace_size_)); + OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); + if (!static_engine_) { + if (!segment_graph_.ParseFromString(serialized_segment_)) { + LOG(ERROR) << "Parsing segment graph failed!"; + context->SetStatus(tensorflow::errors::InvalidArgument( + "Failed to parse segment graphdef!")); + return; + } + serialized_segment_.resize(0); + } + VLOG(1) << "Constructing " << name(); + string precision_string; + OP_REQUIRES_OK(context, + context->GetAttr("precision_mode", &precision_string)); + string calibration_data; + OP_REQUIRES_OK(context, + context->GetAttr("calibration_data", &calibration_data)); + OP_REQUIRES_OK(context, + context->GetAttr("segment_funcdef_name", &funcdef_name_)); + if (precision_string == "FP32") { + precision_mode_ = convert::FP32MODE; + } else if (precision_string == "FP16") { + precision_mode_ = convert::FP16MODE; + } else if (precision_string == "INT8") { + precision_mode_ = convert::INT8MODE; + } + calibration_mode_ = + (precision_mode_ == convert::INT8MODE && calibration_data.size() == 0); + if (calibration_data.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data)); + calibration_data.resize(0); + } + native_func_ = tensorflow::kInvalidHandle; + OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", + &max_cached_engines_)); + OP_REQUIRES_OK(context, + context->GetAttr("fixed_input_size", &fixed_input_size_)); + OP_REQUIRES_OK(context, 
context->GetAttr("cached_engine_batches", + &cached_engine_batches_)); + std::sort(cached_engine_batches_.begin(), cached_engine_batches_.end()); + if (VLOG_IS_ON(1)) { + string s("Engine Batches= "); + for (auto i : cached_engine_batches_) { + StrAppend(&s, i, " "); + } + VLOG(1) << s; + } +} - // register input output node name in trt_sub_graph - OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_)); - OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_)); +void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx, + AsyncHelper* helper) { + if (!calibration_mode_) { + VLOG(1) << "Executing native engine"; + } + std::vector inputs; + std::vector* outputs = new std::vector(); + if (native_func_ == tensorflow::kInvalidHandle) { + auto status = ConstructFunctionHandle(ctx); + if (!status.ok()) { + LOG(ERROR) << "Couldn't construct function handle " << funcdef_name_; + ctx->SetStatus(status); + return; + } + } + auto lib = ctx->function_library(); + tensorflow::FunctionLibraryRuntime::Options opts; + opts.step_id = ctx->step_id(); + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); + opts.runner = ctx->runner(); + for (int i = 0; i < ctx->num_inputs(); i++) { + inputs.push_back(ctx->input(i)); + } + helper->Ref(); // Increment count for calculating native graph + VLOG(1) << "Executing native segment " << name(); + lib->Run(opts, native_func_, inputs, outputs, + [ctx, outputs, helper](const tensorflow::Status& s) { + tensorflow::core::ScopedUnref sc(helper); + VLOG(1) << "Native Segment completed"; + if (!s.ok()) { + ctx->SetStatus(s); + return; + } + for (size_t t = 0; t < outputs->size(); ++t) { + ctx->set_output(t, outputs->at(t)); + } + delete outputs; + }); } -void TRTEngineOp::Compute(OpKernelContext* context) { - // TODO(samikama) runtime should be taken from a resourcemanager as well. - // Only engine should be in the op and context and runtime should be taken - // from resourcemanager +void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, + AsyncHelper* helper) { + helper->Ref(); + tensorflow::core::ScopedUnref sc(helper); + // TODO(aaroey): remove the ResourceMgr singleton. 
+ auto trt_rm = TRTResourceManager::instance(); + auto res_mgr = trt_rm->getManager("TRTCalibration"); + TRTCalibrationResource* calib_res = nullptr; + auto status = res_mgr->LookupOrCreate( + funcdef_name_, "Calibrator", &calib_res, + {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status { + return this->AllocateCalibrationResources(ctx, cr); + }}); + if (!status.ok()) { + ctx->SetStatus(status); + return; + } + int num_inputs = ctx->num_inputs(); + // Pass input data to calibrator + std::unordered_map input_data; + for (int i = 0; i < num_inputs; i++) { + const Tensor& t = ctx->input(i); + void* data_address = GetTensorAddress(&t); + if (data_address == nullptr) { + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unsupported data type encountered in input ", i)); + return; + } + // Check the allocated buffer is sufficient for input + const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx); + CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + input_data.emplace(StrCat(kInputPHName, i), data_address); + } + VLOG(2) << "Filled map for sending"; + // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files + const cudaStream_t* stream = CHECK_NOTNULL( + reinterpret_cast(ctx->op_device_context() + ->stream() + ->implementation() + ->CudaStreamMemberHack())); + calib_res->calibrator_->setBatch(input_data, *stream); + VLOG(2) << "Passed calibration data"; + ExecuteNativeSegment(ctx, helper); +} - if (!trt_execution_context_ptr_) { - IRuntime* infer = nvinfer1::createInferRuntime(logger); -#if NV_TENSORRT_MAJOR > 3 - auto device = context->device(); - auto dev_allocator = - device->GetAllocator(tensorflow::AllocatorAttributes()); - if (!dev_allocator) { - LOG(FATAL) << "Can't find device allocator for gpu device " - << device->name(); - } - allocator_ = std::make_shared(dev_allocator); - infer->setGpuAllocator(allocator_.get()); -#endif - trt_engine_ptr_.reset(infer->deserializeCudaEngine( - serialized_engine_.c_str(), serialized_engine_.size(), - PluginFactoryTensorRT::GetInstance())); - trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext()); - // Runtime is safe to delete after engine creation - infer->destroy(); - serialized_engine_.clear(); +int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) { + int num_batch = ctx->input(0).shape().dim_size(0); + int smallest_engine = 0; + for (const auto i : cached_engine_batches_) { + if (i >= num_batch) { + smallest_engine = i; + break; + } } - int num_binding = context->num_inputs() + context->num_outputs(); - std::vector buffers(num_binding); + // TODO(sami): Need an LRU here + if (smallest_engine == 0) { + if (max_cached_engines_ > cached_engine_batches_.size()) { + smallest_engine = num_batch; + cached_engine_batches_.push_back(num_batch); + VLOG(1) << "Running with batch size " << num_batch; + } else { + string s("Engine buffer is full. 
buffer limit= "); + StrAppend(&s, max_cached_engines_, ", current entries= "); + for (auto i : cached_engine_batches_) StrAppend(&s, i, ", "); + StrAppend(&s, "Requested batch= ", num_batch); + LOG(ERROR) << s; + ctx->SetStatus(tensorflow::errors::ResourceExhausted( + "Requested batch size is not available and engine cache is full")); + return -1; + } + } + return smallest_engine; +} - size_t binding_index; - int num_batch = 0; - for (int i = 0; i < context->num_inputs(); i++) { - // Grab the input tensor - binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str()); +void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, + tensorflow::AsyncOpKernel::DoneCallback done) { + auto helper = new AsyncHelper(done); + tensorflow::core::ScopedUnref sc(helper); + if (calibration_mode_) { + ExecuteCalibration(ctx, helper); + return; + } + const int smallest_engine = GetEngineBatch(ctx); + if (smallest_engine < 0) return; // GetEngineBatch already set the status. + + const int num_batch = ctx->input(0).shape().dim_size(0); + auto& engine_ctx_pair = GetEngine(smallest_engine, ctx); + auto& trt_engine_ptr = engine_ctx_pair.first; + if (!trt_engine_ptr) { + LOG(WARNING) << "Engine retrieval for batch size " << num_batch + << " failed Running native segment"; + ExecuteNativeSegment(ctx, helper); + return; + } - const Tensor& input_tensor = context->input(i); + const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + std::vector buffers(num_binding); + for (int i = 0; i < ctx->num_inputs(); i++) { + const string inp_name = StrCat(kInputPHName, i); + const size_t binding_index = + trt_engine_ptr->getBindingIndex(inp_name.c_str()); + + const Tensor& input_tensor = ctx->input(i); const TensorShape& input_shape = input_tensor.shape(); - if (i == 0) { - num_batch = input_shape.dim_size(0); - if (num_batch > trt_engine_ptr_->getMaxBatchSize()) { - LOG(FATAL) << "input tensor batch larger than max_batch_size: " - << trt_engine_ptr_->getMaxBatchSize(); - } - } else if (num_batch != input_shape.dim_size(0)) { - LOG(FATAL) << "input data inconsistent batch size"; - break; + if (num_batch != input_shape.dim_size(0)) { + LOG(ERROR) << "input data inconsistent batch size"; + ctx->SetStatus(tensorflow::errors::FailedPrecondition( + "Different batch sizes between input tensors")); + return; } - auto dtype = trt_engine_ptr_->getBindingDataType(binding_index); + auto dtype = trt_engine_ptr->getBindingDataType(binding_index); switch (dtype) { case nvinfer1::DataType::kFLOAT: buffers[binding_index] = (void*)(input_tensor.flat().data()); break; case nvinfer1::DataType::kHALF: - LOG(FATAL) << "half size is not supported yet!"; - break; + LOG(ERROR) << "FP16 inputs are not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "FP16 inputs are not supported!")); + return; case nvinfer1::DataType::kINT8: - LOG(FATAL) << "int8 is not supported yet!"; - break; + LOG(ERROR) << "INT8 inputs are not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "INT8 inputs are not supported!")); + return; default: - LOG(FATAL) << "Unknown data type: " << int(dtype); - break; + LOG(ERROR) << "Unknown TRT data type: " << int(dtype); + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unknown output TRT data type! ", static_cast(dtype))); + return; } } - for (int i = 0; i < static_cast(output_nodes_.size()); i++) { - // This is bad that we have to reallocate output buffer every run. 
+ for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str()); + const string output_name = StrCat(kOutputPHName, i); + const size_t binding_index = + trt_engine_ptr->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; TensorShape output_shape; if (binding_index != -1) { - auto dims = trt_engine_ptr_->getBindingDimensions(binding_index); + auto dims = trt_engine_ptr->getBindingDimensions(binding_index); std::vector trt_shape(dims.nbDims + 1); trt_shape[0] = num_batch; for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j]; - OP_REQUIRES_OK(context, - TensorShapeUtils::MakeShape( - trt_shape.data(), trt_shape.size(), &output_shape)); + OP_REQUIRES_OK( + ctx, TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(), + &output_shape)); } else { - LOG(FATAL) << "output node not found, at " << output_nodes_[i]; - break; + LOG(ERROR) << "output node not found, at " << output_name; + ctx->SetStatus(tensorflow::errors::Internal("output ", output_name, + " couldn't be found!")); + return; } - - OP_REQUIRES_OK(context, - context->allocate_output(i, output_shape, &output_tensor)); - auto dtype = trt_engine_ptr_->getBindingDataType(binding_index); + auto status = ctx->allocate_output(i, output_shape, &output_tensor); + if (!status.ok()) { + LOG(ERROR) << "Allocating output failed with " << status; + ctx->SetStatus(status); + return; + } + auto dtype = trt_engine_ptr->getBindingDataType(binding_index); switch (dtype) { case nvinfer1::DataType::kFLOAT: buffers[binding_index] = reinterpret_cast(output_tensor->flat().data()); break; case nvinfer1::DataType::kHALF: - LOG(FATAL) << "half size is not supported yet!"; - break; + LOG(ERROR) << "half size is not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Half outputs are not supported!")); + return; case nvinfer1::DataType::kINT8: - LOG(FATAL) << "int8 is not supported yet!"; - break; + LOG(ERROR) << "int8 is not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "INT8 outputs are not supported!")); + return; default: - LOG(FATAL) << "Unknown data type: " << int(dtype); - break; + LOG(ERROR) << "Unknown TRT data type: " << static_cast(dtype); + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unsupported output data type! ", static_cast(dtype))); + return; } } // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files const cudaStream_t* stream = CHECK_NOTNULL( - reinterpret_cast(context->op_device_context() + reinterpret_cast(ctx->op_device_context() ->stream() ->implementation() ->CudaStreamMemberHack())); // TODO(jie): trt enqueue does not return error - auto ret = trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], - *stream, nullptr); - VLOG(2) << "enqueue returns: " << ret; + auto& trt_execution_context_ptr = engine_ctx_pair.second; + auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream, + nullptr); + if (!ret) { + LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name(); + ctx->SetStatus(tensorflow::errors::Internal( + "Failed to enqueue batch for TRT engine: ", name())); + } // sync should be done by TF. } + TRTEngineOp::~TRTEngineOp() { - // Order matters! - trt_execution_context_ptr_.reset(); - trt_engine_ptr_.reset(); + // We need to manually destroy the engine and execution context before + // the allocator is destructed. 
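The destructor above encodes an ordering constraint: every cached engine/execution-context pair was created against the op's GPU allocator, so those objects must be released before allocator_ itself. A simplified stand-alone illustration of the same ownership layout (FakeAllocator/FakeEngine/FakeContext are invented placeholders):

#include <map>
#include <memory>
#include <utility>

struct FakeAllocator {};  // owns the device memory pool
struct FakeEngine {};     // built with memory from FakeAllocator
struct FakeContext {};    // created from FakeEngine

struct EngineCache {
  std::map<int, std::pair<std::unique_ptr<FakeEngine>,
                          std::unique_ptr<FakeContext>>> engines;
  std::unique_ptr<FakeAllocator> allocator;

  ~EngineCache() {
    // Release each cached engine/context pair first...
    for (auto& entry : engines) {
      entry.second.first.reset();
      entry.second.second.reset();
    }
    // ...and only then the allocator they were created with.
    allocator.reset();
  }
};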
+ for (auto& eng : engine_map_) { + eng.second.first.reset(); + eng.second.second.reset(); + } allocator_.reset(); } + +nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) { + if (allocator_) return allocator_.get(); + auto device = ctx->device(); + auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { + LOG(ERROR) << "Can't find device allocator for gpu device " + << device->name(); + ctx->SetStatus(tensorflow::errors::Internal( + "Can't get device allocator for device ", device->name())); + return nullptr; + } + allocator_.reset(new TRTDeviceAllocator(alloc)); + return allocator_.get(); +} + +TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, + OpKernelContext* ctx) { + static EngineCtxPair null_pair = { + TrtUniquePtrType(nullptr), + TrtUniquePtrType(nullptr)}; + // TODO(sami): This method needs to be re-written to use resource manager and + // with LRU mechanism option. + tensorflow::mutex_lock lock(engine_mutex_); + + if (static_engine_) { + if (engine_map_.size()) { + if (engine_map_.begin()->first >= batch_size) { + return engine_map_.begin()->second; + } + return null_pair; + } + TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); +#if NV_TENSORRT_MAJOR > 3 + auto allocator = GetAllocator(ctx); + if (allocator == nullptr) { + // GetAllocator already set the Status. + return null_pair; + } + infer->setGpuAllocator(allocator); +#endif + TrtUniquePtrType static_engine( + infer->deserializeCudaEngine(serialized_segment_.c_str(), + serialized_segment_.size(), nullptr)); + auto raw_static_engine = static_engine.get(); + const auto max_batch_size = raw_static_engine->getMaxBatchSize(); + engine_map_[max_batch_size] = { + std::move(static_engine), + TrtUniquePtrType( + raw_static_engine->createExecutionContext())}; + // Runtime is safe to delete after engine creation + serialized_segment_.clear(); + if (max_batch_size < batch_size) return null_pair; + return engine_map_.at(max_batch_size); + } // static_engine_ + + // Handle the dynamic engine case. + auto engine_it = engine_map_.find(batch_size); + if (engine_it == engine_map_.end() && + engine_map_.size() < (size_t)max_cached_engines_) { + nvinfer1::IGpuAllocator* allocator = nullptr; +#if NV_TENSORRT_MAJOR > 3 + allocator = GetAllocator(ctx); + if (allocator == nullptr) { + // GetAllocator already set the Status. + return null_pair; + } +#endif + std::vector shapes; + for (int i = 0; i < ctx->num_inputs(); ++i) { + shapes.emplace_back(ctx->input(i).shape()); + } + TrtUniquePtrType engine; + bool convert_successfully = false; + VLOG(0) << name() << " Constructing a new engine with batch size " + << batch_size; + // Up to this point, calibrator_ can never be empty, since otherwise it + // means calibration_mode_ is true and this path won't get executed. + auto status = convert::ConvertGraphDefToEngine( + segment_graph_, precision_mode_, batch_size, workspace_size_, shapes, + &logger, allocator, calibrator_.get(), &engine, &convert_successfully); + if (!status.ok()) { + if (convert_successfully) { + // This means it fail to build the engine even when the network is built + // successfully, probably due to internal issues. In this case we don't + // retry in the future. 
+ engine_map_[batch_size] = {nullptr, nullptr}; + } + LOG(ERROR) << "Engine creation for batch size " << batch_size + << " failed " << status; + ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!")); + return null_pair; + } + VLOG(1) << "Conversion is done"; + TrtUniquePtrType exec_context( + engine->createExecutionContext()); + engine_map_[batch_size] = {std::move(engine), std::move(exec_context)}; + } + return engine_map_.at(batch_size); +} + +tensorflow::Status TRTEngineOp::AllocateCalibrationResources( + tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) { + auto cres = new TRTCalibrationResource(); + *cr = cres; + // Get the allocator. + auto alloc = ctx->device()->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { + LOG(WARNING) << "Can't get device allocator will not be able to " + "allocate memory from TensorFlow memory pool"; + cres->allocator_.reset(new TRTCudaAllocator); + } else { + cres->allocator_.reset(new TRTDeviceAllocator(alloc)); + } + // Get the input shapes. + const int batch_size = ctx->input(0).dim_size(0); + const int num_inputs = ctx->num_inputs(); + std::vector shapes; + dev_tensors_.resize(num_inputs); + VLOG(1) << " Constructing calibrator"; + for (int i = 0; i < num_inputs; i++) { + // allocate workspace on device for inputs + const tensorflow::Tensor& t = ctx->input(i); + shapes.emplace_back(t.shape()); + Tensor* device_tensor; + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + t.dtype(), t.shape(), &dev_tensors_.at(i), &device_tensor)); + CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + void* device_address = GetTensorAddress(device_tensor); + if (device_address == nullptr) { + return tensorflow::errors::InvalidArgument( + "Unsupported data type encountered in input ", i); + } + device_buffers_.emplace( + StrCat(kInputPHName, i), + std::pair(device_address, device_tensor->TotalBytes())); + } + cres->calibrator_.reset( + new TRTInt8Calibrator(device_buffers_, batch_size, name())); + const string label(name()); + auto segment_graph = &segment_graph_; + const int cuda_gpu_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id; + if (cuda_gpu_id < 0) { + LOG(ERROR) << "Can't get gpu_device_info from context->device()"; + return tensorflow::errors::InvalidArgument( + "Context->device doesn't contain device info!"); + } + const int64 workspace_size_bytes = workspace_size_; + cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes, + cuda_gpu_id, workspace_size_bytes]() { + VLOG(0) << "Starting calibration thread on device " << cuda_gpu_id + << ", Calibration Resource @ " << cres; + auto err = cudaSetDevice(cuda_gpu_id); + if (err != cudaSuccess) { + // TODO(aaroey): should return error here. + LOG(ERROR) << "Couldn't set cuda device to " << cuda_gpu_id + << " in calibration thread"; + } + // ConvertGraphDefToEngine() will try to build the engine. This thread + // will loop inside buildCudaEngine() consuming the calibration data + // that is set by the TF op, and drive the builder until calibrator returns + // false. Engine is discarded after calibration table is generated + // + // TODO(aaroey): maybe setting the max batch size using the python + // calibration wrapper class. 
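The calibration thread started here blocks inside TensorRT's build call while the op feeds it batches, which is the producer/consumer handshake implemented by TRTInt8Calibrator::setBatch/getBatch later in this patch. A simplified standard-library sketch of that handshake (the real calibrator additionally tracks calib_running_ and copies device buffers):

#include <condition_variable>
#include <mutex>

class BatchHandshake {
 public:
  // Producer (the TF op): publish one batch; returns false once calibration
  // has been marked done.
  bool Publish() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !batch_set_ || done_; });
    if (done_) return false;
    batch_set_ = true;
    cv_.notify_all();
    return true;
  }
  // Consumer (the builder thread inside getBatch): take the pending batch.
  bool Consume() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return batch_set_ || done_; });
    if (done_) return false;
    batch_set_ = false;
    cv_.notify_all();
    return true;
  }
  void Done() {  // mirrors setDone(): unblocks both sides and ends the loop
    std::lock_guard<std::mutex> lock(mu_);
    done_ = true;
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool batch_set_ = false;
  bool done_ = false;
};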
+ auto s = convert::ConvertGraphDefToEngine( + *segment_graph, convert::INT8MODE, cres->calibrator_->getBatchSize(), + workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(), + cres->calibrator_.get(), &cres->engine_, + /*convert_successfully=*/nullptr); + if (!s.ok()) { + LOG(ERROR) << "Calibration failed: " << s; + cres->calibrator_->setDone(); // Ignore further pushes + } + VLOG(1) << "Calibration loop terminated " << label; + })); + VLOG(1) << "initialized calibrator resource"; + return tensorflow::Status::OK(); +} + REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index e613a71422..6fe318be6a 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -19,9 +19,14 @@ limitations under the License. #include #include +#include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -30,32 +35,95 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -class Logger; - +class TRTInt8Calibrator; +class TRTCalibrationResource; +class AsyncHelper; // TODO(Sami): Remove this file? -class TRTEngineOp : public OpKernel { + +// This OP can construct TRTEngine on the fly and if construction of engine +// fails, executes equivalent subgraph as a TensorFlow function. +class TRTEngineOp : public AsyncOpKernel { public: explicit TRTEngineOp(OpKernelConstruction* context); - void Compute(OpKernelContext* context) override; + void ComputeAsync(OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; ~TRTEngineOp(); private: - template - struct Destroyer { - void operator()(T* d) { d->destroy(); } - }; - - template - using destroyed_ptr = std::unique_ptr>; - destroyed_ptr trt_engine_ptr_; + // Execute calibration + void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); + + // Construct a function handle for executing native funcdef graph + Status ConstructFunctionHandle(OpKernelContext* ctx); + + // Execute replaced native segment as function Op. + void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); + + // Allocate necessary resources for calibration + Status AllocateCalibrationResources(OpKernelContext* ctx, + TRTCalibrationResource** cr); + // TODO(samikama): context should go to a resource manager! - destroyed_ptr trt_execution_context_ptr_; + typedef std::pair, + TrtUniquePtrType> + EngineCtxPair; + EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx); + // Return engine batch closest to input batch. + int GetEngineBatch(OpKernelContext* ctx); + + nvinfer1::IGpuAllocator* GetAllocator(OpKernelContext* ctx); + + // map to keep engines and their execution context for given batch size. + std::unordered_map engine_map_; std::vector input_nodes_; std::vector output_nodes_; - std::shared_ptr allocator_; - string serialized_engine_; + + // keep device allocator for TRT. + std::unique_ptr allocator_; + + // serialized protobuf segment or trt engine depending on static_engine_ flag. 
+ string serialized_segment_; + + // Name of the function for TF native execution of the segment. + string funcdef_name_; + + // GraphDef representation of the segment. + GraphDef segment_graph_; + + // Lookup table for temporary staging areas of input tensors for calibration. + std::unordered_map> device_buffers_; + + // Temporary staging areas for calibration inputs. + std::vector dev_tensors_; + + // Engine Precision mode. + int precision_mode_; + + // Whether engine is constructed during the conversion or needs to be + // constructed from protobuf segment. + bool static_engine_; + + // Whether to calibrate INT8 engine. + bool calibration_mode_; + + // Whether non-batch ranks of the inputs are assumed to be fixed or not for + // engine construction. + bool fixed_input_size_; + + // Batches of the cached engines + std::vector cached_engine_batches_; + + // Maximum number of cached engines + int max_cached_engines_; + + int64 workspace_size_; + mutex engine_mutex_; + FunctionLibraryRuntime::Handle native_func_; + + // The finalized calibrator for inference. + std::unique_ptr calibrator_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc index 079d73f7be..383635f428 100644 --- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc @@ -28,11 +28,19 @@ extern Status TRTEngineOpShapeInference(InferenceContext* c); } REGISTER_OP("TRTEngineOp") - .Attr("serialized_engine: string") - .Attr("input_nodes: list(string)") - .Attr("output_nodes: list(string)") - .Attr("InT: list({float32})") - .Attr("OutT: list({float32})") + .Attr("serialized_segment: string") + .Attr("input_shapes: list(shape)") + .Attr("output_shapes: list(shape)") + .Attr("segment_funcdef_name: string") + .Attr("InT: list({int8,float16,float32})") + .Attr("OutT: list({int8,float16,float32})") + .Attr("static_engine: bool = true") + .Attr("fixed_input_size: bool = true") + .Attr("cached_engine_batches: list(int) = []") + .Attr("max_cached_engines_count: int = 1") + .Attr("workspace_size_bytes: int") + .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}") + .Attr("calibration_data: string = ''") .Input("in_tensor: InT") .Output("out_tensor: OutT") .SetShapeFn(shape_inference::TRTEngineOpShapeInference); diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index 338475d90e..79f512dbcf 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -21,6 +21,8 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long import six as _six from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert +from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version +from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 @@ -29,7 +31,9 @@ from tensorflow.python.framework import errors_impl as _impl from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops from tensorflow.python.grappler import tf_optimizer +from tensorflow.python.platform import tf_logging from tensorflow.python.util import compat + # pylint: enable=unused-import,line-too-long @@ -40,7 +44,10 @@ def 
create_inference_graph(input_graph_def, max_batch_size=1, max_workspace_size_bytes=2 << 20, precision_mode="FP32", - minimum_segment_size=3): + minimum_segment_size=3, + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]): """Python wrapper for the TRT transformation. Args: @@ -51,6 +58,10 @@ def create_inference_graph(input_graph_def, precision_mode: one of 'FP32', 'FP16' and 'INT8' minimum_segment_size: the minimum number of nodes required for a subgraph to be replaced by TRTEngineOp. + is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT + network and engine at run time. + maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops. + cached_engine_batches: batch sizes used to pre-create cached engines. Returns: New GraphDef with TRTEngineOps placed in graph replacing subgraphs. @@ -65,6 +76,30 @@ def create_inference_graph(input_graph_def, "It should be one of {}").format( precision_mode, "{'FP32', 'FP16', 'INT8'}")) mode = supported_precision_modes[precision_mode.upper()] + compiled_version = get_linked_tensorrt_version() + loaded_version = get_loaded_tensorrt_version() + version_mismatch = False + if loaded_version[0] < compiled_version[0]: + tf_logging.error( + "TensorRT version mismatch. Tensorflow was compiled against " + + "TensorRT %s but library loaded from environment is TensorRT %s" % + (".".join([str(x) for x in compiled_version]), + ".".join([str(x) for x in loaded_version])) + + ". Please make sure that correct version of TensorRT " + + "is available in the system and added to ldconfig or LD_LIBRARY_PATH" + ) + raise RuntimeError("Incompatible TensorRT library version") + for i in zip(loaded_version, compiled_version): + if i[0] != i[1]: + tf_logging.warn("TensorRT mismatch. Compiled against version " + + "%s, but loaded %s. Things may not work" % + (".".join([str(x) for x in compiled_version]), + ".".join([str(x) for x in loaded_version]))) + version_mismatch = True + break + if not version_mismatch: + tf_logging.info("Running against TensorRT version %s" % ".".join( + [str(x) for x in loaded_version])) def py2bytes(inp): return inp @@ -100,7 +135,9 @@ def create_inference_graph(input_graph_def, # pair or strings where first one is encoded status and the second # one is the transformed graphs protobuf string. out = trt_convert(input_graph_def_str, out_names, max_batch_size, - max_workspace_size_bytes, mode, minimum_segment_size) + max_workspace_size_bytes, mode, minimum_segment_size, + is_dynamic_op, maximum_cached_engines, + cached_engine_batches) status = to_string(out[0]) output_graph_def_string = out[1] del input_graph_def_str # Save some memory @@ -120,11 +157,12 @@ def create_inference_graph(input_graph_def, return output_graph_def -def calib_graph_to_infer_graph(calibration_graph_def): +def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False): """Convert an existing calibration graph to inference graph. Args: calibration_graph_def: the calibration GraphDef object with calibration data + is_dynamic_op: whether to create dynamic static engines from calibration Returns: New GraphDef with TRTEngineOps placed in graph replacing calibration nodes. 
Raises: @@ -141,9 +179,16 @@ def calib_graph_to_infer_graph(calibration_graph_def): to_string = py2string else: to_string = py3string - + is_calib_graph = False + for n in calibration_graph_def.node: + if n.op == "TRTEngineOp": + is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s + if not is_calib_graph: + tf_logging.error( + "Not a calib graph. Doesn't seem to contain any calibration nodes.") + return None graph_str = calibration_graph_def.SerializeToString() - out = calib_convert(graph_str) + out = calib_convert(graph_str, is_dynamic_op) status = to_string(out[0]) output_graph_def_string = out[1] del graph_str # Save some memory diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc index 0f0508331c..9f115990c3 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc @@ -50,7 +50,7 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator) } void TRTDeviceAllocator::free(void* memory) { - VLOG(2) << "Deallocating " << memory; + VLOG(2) << "Deallocating @ " << memory; allocator_->DeallocateRaw(memory); } diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h index a0c2540a76..c5d2cec730 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_ #define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_ - #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/core/framework/allocator.h" @@ -52,7 +51,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator { // Allocator implementation wrapping TF device allocators. public: TRTDeviceAllocator(tensorflow::Allocator* allocator); - virtual ~TRTDeviceAllocator() {} + virtual ~TRTDeviceAllocator() { + VLOG(1) << "Destroying allocator attached to " << allocator_->Name(); + } void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override; void free(void* memory) override; diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc index dc7c93f869..32e81858b9 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" #include -#include #include #include "tensorflow/core/platform/logging.h" @@ -37,15 +36,22 @@ TRTInt8Calibrator::TRTInt8Calibrator( : batch_size_(batch_size), done_(false), dev_buffers_(dev_buffers), - calib_running_(false), + calib_running_(true), batch_is_set_(false), engine_name_(engine_name) {} +TRTInt8Calibrator::TRTInt8Calibrator(const string& calib_data) + : batch_size_(0), + done_(false), + calib_running_(false), + batch_is_set_(false), + calibration_table_(calib_data) {} + bool TRTInt8Calibrator::setBatch(const std::unordered_map& data, const cudaStream_t stream) { tensorflow::mutex_lock lock(cond_mtx_); - while ((calib_running_ || batch_is_set_) && - !done_) { // wait while calibration is running + // wait while calibration is running. 
+ while ((calib_running_ || batch_is_set_) && !done_) { cond_.wait(lock); } if (done_) return false; @@ -59,8 +65,6 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map& data, } const auto& d = devptr->second; - // TODO(aaroey): we should not use sync copy on default stream. Make sure - // stream->ThenMemcpy() is used in future PRs. // TODO(sami,aaroey): Need to figure out a way to ensure synchronization // between stream, perhaps using a tensor? auto status = cudaMemcpyAsync(d.first, it.second, d.second, @@ -84,13 +88,11 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, tensorflow::mutex_lock lock(cond_mtx_); calib_running_ = false; cond_.notify_all(); - while ((!batch_is_set_ && !done_)) { // wait until new batch arrives + // wait until new batch arrives + while ((!batch_is_set_ && !done_)) { cond_.wait(lock); - - } - if (done_) { - return false; } + if (done_) return false; for (int i = 0; i < num_bindings; i++) { auto it = dev_buffers_.find(names[i]); @@ -107,7 +109,9 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, } const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) { - return nullptr; + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); } void TRTInt8Calibrator::setDone() { @@ -117,7 +121,11 @@ void TRTInt8Calibrator::setDone() { } void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, - std::size_t length) {} + std::size_t length) { + calibration_table_ = string((const char*)ptr, length); + VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr + << " length=" << length; +} TRTInt8Calibrator::~TRTInt8Calibrator() { VLOG(1) << "Destroying calibrator for " << engine_name_; } diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h index d77aa2c5ab..994312d7c3 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h @@ -39,29 +39,48 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { TRTInt8Calibrator( const std::unordered_map>& dev_buffers, int batch_size, string engine_name); + + TRTInt8Calibrator(const string& calibration_data); + + ~TRTInt8Calibrator(); + int getBatchSize() const override; + bool getBatch(void* bindings[], const char* names[], int num_bindings) override; + bool setBatch(const std::unordered_map& data, const cudaStream_t stream); + void setDone(); + + // If not null, calibration is skipped. const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; - ~TRTInt8Calibrator(); + + const string& getCalibrationTableAsString() { return calibration_table_; } private: const int batch_size_; - tensorflow::mutex cond_mtx_; // mutex for condition_variable - tensorflow::condition_variable cond_; // condition variable to implement - // producer-consumer queue for - // calibration + + // mutex for condition_variable + tensorflow::mutex cond_mtx_; + + // condition variable to implement producer-consumer queue for calibration + tensorflow::condition_variable cond_; + + // Is calibration finished? 
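The readCalibrationCache/writeCalibrationCache changes above give the calibrator a persistence hook: a calibration run stores the generated table, and a non-empty table handed back to TensorRT lets the builder skip calibration entirely. A minimal stand-alone version of that contract (CalibCache is an invented name):

#include <cstddef>
#include <string>

class CalibCache {
 public:
  // Returns the stored table, or nullptr so that TensorRT runs calibration.
  const void* Read(std::size_t& length) {
    if (table_.empty()) return nullptr;
    length = table_.size();
    return table_.data();
  }
  // Called by TensorRT with the freshly generated calibration table.
  void Write(const void* ptr, std::size_t length) {
    table_.assign(static_cast<const char*>(ptr), length);
  }

 private:
  std::string table_;
};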
bool done_; - const std::unordered_map> - dev_buffers_; // map to keep tensorrt input buffers and sizes keyed with - // buffer names + + // Map to keep tensorrt input buffers and sizes keyed with buffer names + const std::unordered_map> dev_buffers_; + bool calib_running_; bool batch_is_set_; + string engine_name_; + string calibration_table_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index e3469124ac..b7d5ffd674 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" @@ -34,50 +35,48 @@ limitations under the License. namespace tensorflow { namespace tensorrt { + class TRTCalibrationResource : public tensorflow::ResourceBase { public: - TRTCalibrationResource() - : calibrator_(nullptr), - builder_(nullptr), - network_(nullptr), - engine_(nullptr), - logger_(nullptr), - thr_(nullptr) {} - ~TRTCalibrationResource() { VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); + builder_.reset(); + engine_.reset(); + // We need to manually destroy the builder and engine before the allocator + // is destroyed. + allocator_.reset(); } string DebugString() override { std::stringstream oss; - oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl - << " Builder = " << std::hex << builder_ << std::dec << std::endl - << " Network = " << std::hex << network_ << std::dec << std::endl - << " Engine = " << std::hex << engine_ << std::dec << std::endl - << " Logger = " << std::hex << logger_ << std::dec << std::endl - << " Allocator = " << std::hex << allocator_.get() << std::dec - << std::endl - << " Thread = " << std::hex << thr_ << std::dec << std::endl; + using std::dec; + using std::endl; + using std::hex; + oss << " Calibrator = " << hex << calibrator_.get() << dec << endl + << " Builder = " << hex << builder_.get() << dec << endl + << " Engine = " << hex << engine_.get() << dec << endl + << " Logger = " << hex << &logger_ << dec << endl + << " Allocator = " << hex << allocator_.get() << dec << endl + << " Thread = " << hex << thr_.get() << dec << endl; return oss.str(); } - TRTInt8Calibrator* calibrator_; - nvinfer1::IBuilder* builder_; - nvinfer1::INetworkDefinition* network_; - nvinfer1::ICudaEngine* engine_; - std::shared_ptr allocator_; - tensorflow::tensorrt::Logger* logger_; + std::unique_ptr calibrator_; + TrtUniquePtrType builder_; + TrtUniquePtrType engine_; + std::unique_ptr allocator_; + tensorflow::tensorrt::Logger logger_; // TODO(sami): Use threadpool threads! 
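The raw pointers replaced above become unique_ptrs, and the TensorRT objects use TrtUniquePtrType, which presumably wraps the same destroy()-based deleter as the Destroyer template this patch removes from trt_engine_op.h. A sketch of that pattern under that assumption (TrtDestroyer/TrtUniquePtr are illustrative names):

#include <memory>

// TensorRT objects are released through destroy(), not delete, so a
// unique_ptr holding them needs a custom deleter.
template <typename T>
struct TrtDestroyer {
  void operator()(T* t) const {
    if (t) t->destroy();
  }
};

template <typename T>
using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;

// e.g. TrtUniquePtr<nvinfer1::ICudaEngine> engine(builder->buildCudaEngine(network));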
- std::thread* thr_; + std::unique_ptr thr_; }; -class TRTWeightStore : public tensorflow::ResourceBase { +class TRTWeightStore { public: TRTWeightStore() {} virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); } - string DebugString() override { + string DebugString() { std::stringstream oss; size_t len_bytes = 0; for (const auto& v : store_) { diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h index 1568dd9153..81b4bfe49f 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.h +++ b/tensorflow/contrib/tensorrt/segment/segment.h @@ -29,8 +29,9 @@ namespace tensorflow { namespace tensorrt { namespace segment { -// vector of segments, each entry contains a device name and a set of nodes in -// segment +// Vector of segments, each entry contains a set of node names and a device name +// in the segment. +// TODO(aaroey): use node pointer instead of node name. using SegmentNodesVector = std::vector, string>>; struct SegmentOptions { @@ -48,6 +49,8 @@ struct SegmentOptions { // in the vector describes a subgraph by giving a set of the names of // all the NodeDefs in that subgraph. // @return the status. +// +// TODO(aaroey): remove this method. tensorflow::Status SegmentGraph( const tensorflow::GraphDef& gdef, const std::function& candidate_fn, diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc index f36495f6b6..227ac120dd 100644 --- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc +++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc @@ -29,61 +29,35 @@ namespace tensorflow { namespace shape_inference { tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) { - tensorflow::tensorrt::Logger logger; - string serialized_engine; - TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine)); - nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger); - nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine( - serialized_engine.c_str(), serialized_engine.size(), - tensorrt::PluginFactoryTensorRT::GetInstance()); - - int num_batch = -1; - std::vector<::tensorflow::DataType> input_type; - TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type)); - for (size_t i = 0; i < context->num_inputs(); i++) { - // Check if input shape is legit - auto input_shape = context->input(i); - for (int j = 0; j < context->Rank(input_shape); j++) { - auto dim_handler = context->Dim(input_shape, j); - if (j == 0) { - if (i == 0) { - num_batch = context->Value(dim_handler); - } else if (num_batch != context->Value(dim_handler)) { - // TODO(jie): TensorRT engine requires consistent batch between inputs - // tensors. Segmenter should be aware of this. 
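For readability, the SegmentNodesVector alias in segment.h above, written out with its full template arguments as described by its comment (a set of node names plus a device name per segment), is presumably:

#include <set>
#include <string>
#include <utility>
#include <vector>

// One entry per TRT segment: the names of the nodes in the segment and the
// device the segment is assigned to.
using SegmentNodesVector =
    std::vector<std::pair<std::set<std::string>, std::string>>;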
- LOG(FATAL) << "TensorRT engine requires consistent batch size"; - } - } - } + std::vector shapes; + for (int i = 0; i < context->num_outputs(); ++i) { + context->set_output(i, context->UnknownShape()); } - - // Arrange input here - std::vector input_nodes; - TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes)); - - // Arrange output here - std::vector output_nodes; - TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes)); - for (size_t i = 0; i < output_nodes.size(); i++) { - int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str()); - ShapeHandle output_shape; - std::vector dim_vec; - dim_vec.emplace_back(context->MakeDim(num_batch)); - if (binding_index != -1) { - auto dims = trt_engine->getBindingDimensions(binding_index); - for (int j = 0; j < dims.nbDims; j++) { - dim_vec.emplace_back(context->MakeDim(dims.d[j])); - } - } else { - LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i]; - } - output_shape = context->MakeShape(dim_vec); - context->set_output(i, output_shape); + auto status = context->GetAttr("input_shapes", &shapes); + // it is ok to not to have shapes + if (!status.ok()) return Status::OK(); + if ((int)shapes.size() != context->num_inputs()) return Status::OK(); + bool different_input = false; + for (int i = 0; i < context->num_inputs(); ++i) { + if (shapes.at(i) != context->input_tensor(i)->shape()) + different_input = true; + } + if (different_input) return Status::OK(); + shapes.resize(0); + status = context->GetAttr("output_shapes", &shapes); + if (!status.ok()) return Status::OK(); + if ((int)shapes.size() != context->num_outputs()) return Status::OK(); + std::vector shape_handles(shapes.size()); + for (size_t i = 0; i < shapes.size(); ++i) { + status = + context->MakeShapeFromTensorShape(shapes.at(i), &shape_handles.at(i)); + if (!status.ok()) return Status::OK(); + } + for (int i = 0; i < context->num_outputs(); ++i) { + context->set_output(i, shape_handles.at(i)); } - return Status::OK(); } - } // namespace shape_inference } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py index 175ccd8006..090aa8bdb0 100644 --- a/tensorflow/contrib/tensorrt/test/test_tftrt.py +++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py @@ -20,6 +20,7 @@ from __future__ import print_function import argparse import numpy as np +import six as _six # normally we should do import tensorflow as tf and then # tf.placeholder, tf.constant, tf.nn.conv2d etc but @@ -35,10 +36,75 @@ from tensorflow.python.framework import dtypes as dtypes from tensorflow.python.framework import importer as importer from tensorflow.python.framework import ops as ops from tensorflow.python.ops import array_ops as aops +from tensorflow.python.ops import math_ops as mops from tensorflow.python.ops import nn as nn from tensorflow.python.ops import nn_ops as nn_ops +def py2bytes(inp): + return inp + + +def py3bytes(inp): + return inp.encode("utf-8", errors="surrogateescape") + + +def py2string(inp): + return inp + + +def py3string(inp): + return inp.decode("utf-8") + + +if _six.PY2: + to_bytes = py2bytes + to_string = py2string +else: + to_bytes = py3bytes + to_string = py3string + + +def get_multi_engine_graph_def(mode="FP32"): + """Create a simple graph and return its graph_def.""" + dtype = dtypes.float32 + if mode.upper() == "FP16": + dtype = dtypes.float16 + else: + pass + + g = ops.Graph() + with g.as_default(): + x = aops.placeholder(shape=[None, 3, 7, 5], name="input", 
dtype=dtype) + with g.name_scope("Global_scope"): + with g.name_scope("first_scope"): + e = cop.constant( + np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype) + conv = nn.conv2d( + input=x, + filter=e, + data_format="NCHW", + strides=[1, 1, 1, 1], + padding="VALID", + name="conv") + b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype) + t = conv * b + + b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype) + q = conv / b + edge = mops.sin(q) + edge1 = mops.cos(conv) + with g.name_scope("test_scope"): + de = edge + edge1 + t -= edge1 + q *= edge + t += q + t -= de + k = aops.squeeze(t, name="output") + print(k.dtype) + return g.as_graph_def() + + def get_simple_graph_def(): """Create a simple graph and return its graph_def.""" g = ops.Graph() @@ -65,7 +131,9 @@ def get_simple_graph_def(): def execute_graph(gdef, dumm_inp): """Run given graphdef once.""" print("executing") - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) sessconfig = cpb2.ConfigProto(gpu_options=gpu_options) ops.reset_default_graph() g = ops.Graph() @@ -83,7 +151,9 @@ def execute_graph(gdef, dumm_inp): # for calibration. For this test script it is random data. def execute_calibration(gdef, dumm_inp): """Run given calibration graph multiple times.""" - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) ops.reset_default_graph() g = ops.Graph() with g.as_default(): @@ -100,12 +170,17 @@ def execute_calibration(gdef, dumm_inp): return val -def user(run_graph=execute_graph, run_calibration=execute_calibration): +def user(multi_engine, + run_graph=execute_graph, + run_calibration=execute_calibration): """Example function that converts a graph to TFTRT graph.""" - - inp_dims = (100, 24, 24, 2) + if multi_engine: + inp_dims = (2, 3, 7, 5) + orig_graph = get_multi_engine_graph_def() + else: + inp_dims = (100, 24, 24, 2) + orig_graph = get_simple_graph_def() # use a frozen graph for inference dummy_input = np.random.random_sample(inp_dims) - orig_graph = get_simple_graph_def() # use a frozen graph for inference # Get optimized graph trt_graph = trt.create_inference_graph( input_graph_def=orig_graph, @@ -113,8 +188,10 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration): max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="FP32", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) o1 = run_graph(orig_graph, dummy_input) o2 = run_graph(trt_graph, dummy_input) o3 = run_graph(trt_graph, dummy_input) @@ -126,40 +203,51 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration): max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="FP16", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) int8_calib_gdef = trt.create_inference_graph( input_graph_def=orig_graph, 
outputs=["output"], max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="INT8", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) o4 = run_graph(fp16_graph, dummy_input) _ = run_calibration(int8_calib_gdef, dummy_input) int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef) o5 = run_graph(int8_graph, dummy_input) - assert np.allclose(o1, o4) - assert np.allclose(o1, o5) + print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4)) + print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5)) print("Pass") -def auto(): +def auto(multi_engine): """Run the conversion as an optimization pass.""" - inp_dims = (100, 24, 24, 2) + if multi_engine: + inp_dims = (2, 3, 7, 5) + orig_graph = get_multi_engine_graph_def() + else: + inp_dims = (100, 24, 24, 2) + orig_graph = get_simple_graph_def() # use a frozen graph for inference dummy_input = np.random.random_sample(inp_dims) - orig_graph = get_simple_graph_def() opt_config = rwpb2.RewriterConfig() + opt_config.meta_optimizer_iterations = opt_config.ONE opt_config.optimizers.extend(["constfold", "layout"]) custom_op = opt_config.custom_optimizers.add() custom_op.name = "TensorRTOptimizer" custom_op.parameter_map["minimum_segment_size"].i = 3 - custom_op.parameter_map["precision_mode"].s = "FP32" + custom_op.parameter_map["precision_mode"].s = to_bytes("FP32") custom_op.parameter_map["max_batch_size"].i = inp_dims[0] custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25 print(custom_op) - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) graph_options = cpb2.GraphOptions(rewrite_options=opt_config) sessconfig = cpb2.ConfigProto( gpu_options=gpu_options, graph_options=graph_options) @@ -168,7 +256,7 @@ def auto(): ops.reset_default_graph() with g.as_default(): inp, out = importer.import_graph_def( - graph_def=orig_graph, return_elements=["input", "output"]) + graph_def=orig_graph, return_elements=["input", "output"], name="") inp = inp.outputs[0] out = out.outputs[0] with csess.Session(config=sessconfig, graph=g) as sess: @@ -186,8 +274,14 @@ if "__main__" in __name__: action="store_true", help="Do TRT conversion automatically", default=False) + P.add_argument( + "--multi-engine", + "-m", + action="store_true", + help="Use a graph that will result in 2 engines", + default=False) flags, unparsed = P.parse_known_args() if flags.automatic: - auto() + auto(flags.multi_engine) else: - user() + user(flags.multi_engine) diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i index 46480e99a1..d51a0b59e2 100644 --- a/tensorflow/contrib/tensorrt/trt_conversion.i +++ b/tensorflow/contrib/tensorrt/trt_conversion.i @@ -48,12 +48,53 @@ PyObject* pair_helper(std::pair* in) { } return tuple; } + +struct version_struct{ + int vmajor; + int vminor; + int vpatch; +}; + +PyObject* version_helper(version_struct* in) { + PyObject *tuple(nullptr); + tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch); + if (!tuple) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "Tuple creation from version structure failed!"); + } + return NULL; + } + return tuple; +} +/* 
Define converters for vector */ +template<> +bool _PyObjAs(PyObject *pyobj, int* dest) { + *dest = PyLong_AsLong(pyobj); + return true; +} + +template<> +PyObject *_PyObjFrom(const int& src) { + return PyLong_FromLong(src); +} + %} + +_LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong); + %typemap(out) std::pair { PyObject *tuple = pair_helper(&$1); if (!tuple) SWIG_fail; $result = tuple; } + +%typemap(out) version_struct { + PyObject *tuple = version_helper(&$1); + if (!tuple) SWIG_fail; + $result = tuple; +} + %{ #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -65,6 +106,8 @@ PyObject* pair_helper(std::pair* in) { %unignore tensorflow; %unignore trt_convert; %unignore calib_convert; +%unignore get_linked_tensorrt_version; +%unignore get_loaded_tensorrt_version; %{ @@ -74,7 +117,10 @@ std::pair trt_convert( size_t max_batch_size, size_t max_workspace_size_bytes, int precision_mode, - int minimum_segment_size + int minimum_segment_size, + bool is_dyn_op, + int max_cached_engines, + std::vector cached_engine_batches // Unfortunately we can't use TF_Status here since it // is in c/c_api and brings in a lot of other libraries // which in turn declare ops. These ops are included @@ -102,11 +148,12 @@ std::pair trt_convert( out_status = "InvalidArgument;Size of the output_names vector is 0"; return std::pair{out_status, ""}; } - tensorflow::GraphDef outGraph; + tensorflow::GraphDef out_graph; tensorflow::Status conversion_status = tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT( graph_def, output_names, max_batch_size, max_workspace_size_bytes, - &outGraph, precision_mode, minimum_segment_size); + &out_graph, precision_mode, minimum_segment_size, + is_dyn_op, max_cached_engines, cached_engine_batches); if (!conversion_status.ok()) { auto retCode = (int)conversion_status.code(); char buff[2000]; @@ -116,7 +163,7 @@ std::pair trt_convert( return std::pair{out_status, ""}; } string result; - if (!outGraph.SerializeToString(&result)) { + if (!out_graph.SerializeToString(&result)) { out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; return std::pair{out_status, ""}; } @@ -128,7 +175,8 @@ std::pair trt_convert( #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } -std::pair calib_convert(string graph_def_string // const tensorflow::GraphDef& +std::pair calib_convert( + string graph_def_string, bool is_dyn_op // unfortunately we can't use TF_Status here since it // is in c/c_api and brings in a lot of other libraries // which in turn declare ops. 
These ops are included @@ -147,11 +195,11 @@ std::pair calib_convert(string graph_def_string // const tenso out_status = "InvalidArgument;Couldn't interpret input as a GraphDef"; return std::pair{out_status, ""}; } - - tensorflow::GraphDef outGraph; + graph_def_string.resize(0); + tensorflow::GraphDef out_graph; tensorflow::Status conversion_status = - tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def, - &outGraph); + tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph( + graph_def, &out_graph, is_dyn_op); if (!conversion_status.ok()) { auto retCode = (int)conversion_status.code(); char buff[2000]; @@ -161,7 +209,7 @@ std::pair calib_convert(string graph_def_string // const tenso return std::pair{out_status, ""}; } string result; - if (!outGraph.SerializeToString(&result)) { + if (!out_graph.SerializeToString(&result)) { out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; return std::pair{out_status, ""}; } @@ -172,15 +220,39 @@ std::pair calib_convert(string graph_def_string // const tenso return std::pair{"9;TensorRT is not enabled!", ""}; #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } + +version_struct get_linked_tensorrt_version(){ + // Return the version at the link time. + const auto &lv = tensorflow::tensorrt::convert::GetLinkedTensorRTVersion(); + version_struct s; + s.vmajor = lv[0]; + s.vminor = lv[1]; + s.vpatch = lv[2]; + return s; +} +version_struct get_loaded_tensorrt_version(){ + // Return the version from the loaded library. + const auto &lv = tensorflow::tensorrt::convert::GetLoadedTensorRTVersion(); + version_struct s; + s.vmajor = lv[0]; + s.vminor = lv[1]; + s.vpatch = lv[2]; + return s; +} + %} -std::pair calib_convert(string graph_def_string); +std::pair calib_convert(string graph_def_string, bool is_dyn_op); std::pair trt_convert(string graph_def_string, std::vector output_names, size_t max_batch_size, size_t max_workspace_size_bytes, - int precision_mode, int minimum_segment_size); - + int precision_mode, int minimum_segment_size, + bool is_dyn_op, + int max_cached_engines, + std::vector cached_engine_batches); +version_struct get_linked_tensorrt_version(); +version_struct get_loaded_tensorrt_version(); %unignoreall diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index 3b2d7adfff..38d1c3049e 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -49,11 +49,11 @@ tf_cc_binary( ":tpu_profiler_analysis_proto_cc", ":tpu_profiler_proto_cc", ":version", + "//tensorflow:grpc++", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/platform/cloud:gcs_file_system", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD index 1b45584dcb..19cb8983b6 100644 --- a/tensorflow/contrib/verbs/BUILD +++ b/tensorflow/contrib/verbs/BUILD @@ -53,12 +53,12 @@ cc_library( ":grpc_verbs_service_impl", ":rdma_mgr", ":verbs_service_proto_cc", + "//tensorflow:grpc++", "//tensorflow/core:lib_internal", "//tensorflow/core/distributed_runtime:session_mgr", "//tensorflow/core/distributed_runtime/rpc:async_service_interface", "//tensorflow/core/distributed_runtime/rpc:grpc_call", "//tensorflow/core/distributed_runtime/rpc:grpc_util", - "@grpc//:grpc++", ], alwayslink = 1, ) @@ -69,7 +69,7 @@ cc_library( hdrs = ["grpc_verbs_service_impl.h"], deps = [ ":verbs_service_proto_cc", - "@grpc//:grpc++", + "//tensorflow:grpc++", ], ) diff --git 
a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 19d6438809..06b797e32e 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -4,6 +4,7 @@ # The following targets can be used to access ApiDefs: # :base_api_def # :python_api_def +# :java_api_def package( default_visibility = ["//visibility:private"], @@ -29,6 +30,12 @@ filegroup( visibility = ["//tensorflow:internal"], ) +filegroup( + name = "java_api_def", + srcs = glob(["java_api/*"]), + visibility = ["//tensorflow:internal"], +) + cc_library( name = "excluded_ops_lib", srcs = ["excluded_ops.cc"], diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt index 6f1121dd37..5ab5917bd3 100644 --- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt @@ -68,7 +68,7 @@ END name: "area_range" description: <