From 1fa73c53ab95693f070ce70e6be0c644d83c163a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 26 Jun 2017 14:00:17 -0700
Subject: Automated g4 rollback of changelist 160182040

PiperOrigin-RevId: 160190881
---
 CONTRIBUTING.md                                    |   7 +-
 ISSUE_TEMPLATE.md                                  |   1 -
 README.md                                          |  14 +-
 RELEASE.md                                         |   2 -
 configure                                          |   6 +-
 tensorflow/c/generate-pc.sh                        |   2 +-
 tensorflow/cc/BUILD                                |  21 +-
 tensorflow/cc/gradients/math_grad.cc               |  26 --
 tensorflow/cc/gradients/math_grad_test.cc          |  52 ----
 tensorflow/cc/gradients/nn_grad.cc                 |  13 -
 tensorflow/cc/gradients/nn_grad_test.cc            |  13 -
 tensorflow/compiler/plugin/BUILD                   |   4 +-
 tensorflow/compiler/plugin/executor/BUILD          |  32 --
 tensorflow/compiler/plugin/executor/compiler.cc    | 123 --------
 tensorflow/compiler/plugin/executor/compiler.h     |  64 ----
 tensorflow/compiler/plugin/executor/device.cc      |  60 ----
 tensorflow/compiler/plugin/executor/executable.cc  | 147 ---------
 tensorflow/compiler/plugin/executor/executable.h   |  65 ----
 tensorflow/compiler/plugin/executor/executor.cc    | 135 --------
 tensorflow/compiler/plugin/executor/executor.h     | 213 -------------
 tensorflow/compiler/plugin/executor/platform.cc    | 125 --------
 tensorflow/compiler/plugin/executor/platform.h     |  83 -----
 tensorflow/compiler/plugin/executor/platform_id.h  |  31 --
 .../compiler/plugin/executor/transfer_manager.cc   | 187 -----------
 .../compiler/plugin/executor/transfer_manager.h    |  77 -----
 tensorflow/compiler/tests/ftrl_test.py             |   2 +-
 .../compiler/tf2xla/kernels/batch_matmul_op.cc     |   6 +-
 .../compiler/tf2xla/kernels/batchtospace_op.cc     |   3 +-
 .../compiler/tf2xla/kernels/depthwise_conv_ops.cc  |  19 +-
 tensorflow/compiler/tf2xla/kernels/diag_op.cc      |   8 +-
 .../compiler/tf2xla/kernels/dynamic_stitch_op.cc   |   4 +-
 tensorflow/compiler/tf2xla/kernels/slice_op.cc     |   4 +-
 tensorflow/compiler/tf2xla/kernels/split_op.cc     |  14 +-
 .../compiler/tf2xla/kernels/strided_slice_op.cc    |  44 ++-
 .../compiler/tf2xla/kernels/tensor_array_ops.cc    |   8 +-
 tensorflow/compiler/tf2xla/kernels/unpack_op.cc    |   4 +-
 .../compiler/xla/client/computation_builder.cc     |   6 +-
 .../compiler/xla/client/computation_builder.h      |   4 +-
 tensorflow/compiler/xla/literal_util.cc            |  10 +-
 tensorflow/compiler/xla/literal_util_test.cc       |  57 ----
 tensorflow/compiler/xla/service/BUILD              |   2 +
 .../compiler/xla/service/algebraic_simplifier.cc   |   7 +-
 .../xla/service/algebraic_simplifier_test.cc       |   8 +-
 .../compiler/xla/service/buffer_assignment_test.cc |  12 +-
 .../compiler/xla/service/buffer_liveness_test.cc   |   2 +-
 .../compiler/xla/service/compile_only_service.h    |   2 +-
 .../compiler/xla/service/computation_placer.cc     |   9 +-
 .../compiler/xla/service/computation_placer.h      |   6 +-
 .../compiler/xla/service/cpu/cpu_compiler.cc       |   9 +
 .../compiler/xla/service/elemental_ir_emitter.cc   |  17 +-
 .../compiler/xla/service/gpu/pad_insertion.cc      |  11 +-
 .../xla/service/hlo_constant_folding_test.cc       |   3 +-
 tensorflow/compiler/xla/service/hlo_instruction.cc |   7 +-
 tensorflow/compiler/xla/service/hlo_instruction.h  |  13 +-
 .../xla/service/hlo_rematerialization_test.cc      |  16 +-
 .../compiler/xla/service/llvm_ir/llvm_util.cc      |  23 +-
 tensorflow/compiler/xla/service/shape_inference.cc |  28 +-
 tensorflow/compiler/xla/service/shape_inference.h  |   3 +-
 .../compiler/xla/service/shape_inference_test.cc   |  33 +-
 .../xla/service/tuple_points_to_analysis_test.cc   |   2 +-
 .../compiler/xla/service/user_computation.cc       |   6 +-
 .../xla/tests/array_elementwise_ops_test.cc        |   2 +-
 .../compiler/xla/tests/dot_operation_test.cc       |   4 +-
 tensorflow/compiler/xla/tests/fusion_test.cc       |   2 +-
 .../xla/tests/multidimensional_slice_test.cc       |   4 +-
 tensorflow/compiler/xla/tests/params_test.cc       |   2 +-
 tensorflow/compiler/xla/tests/slice_test.cc        |  51 ++-
 tensorflow/compiler/xla/tests/while_test.cc        |   3 +-
 tensorflow/compiler/xla/util.h                     |  18 +-
 tensorflow/compiler/xla/xla_data.proto             |   3 +-
 .../android/TensorFlowInferenceInterface.java      |  20 +-
 .../python/kernel_tests/csiszar_divergence_test.py |  43 ++-
 .../python/ops/csiszar_divergence_impl.py          |  62 ++--
 tensorflow/contrib/cmake/CMakeLists.txt            |   2 +-
 tensorflow/contrib/cmake/tf_python.cmake           |   1 +
 .../python/ops/bijectors/affine_impl.py            |   4 +-
 .../distributions/python/ops/relaxed_bernoulli.py  |   2 +-
 .../distributions/python/ops/sample_stats.py       |   2 +-
 .../python/ops/vector_laplace_linear_operator.py   |   2 +-
 tensorflow/contrib/graph_editor/transform.py       |   2 +-
 tensorflow/contrib/keras/python/keras/backend.py   |   6 +-
 .../contrib/keras/python/keras/layers/core.py      |   2 +-
 .../contrib/keras/python/keras/layers/recurrent.py |   2 +-
 .../contrib/keras/python/keras/models_test.py      |   2 +-
 tensorflow/contrib/layers/python/layers/layers.py  |   3 +-
 .../learn/python/learn/estimators/debug_test.py    |   4 +-
 .../learn/python/learn/estimators/linear_test.py   |  20 +-
 .../learn/python/learn/estimators/model_fn_test.py |  10 +-
 tensorflow/contrib/learn/python/learn/models.py    |   4 +-
 tensorflow/contrib/lookup/lookup_ops.py            |   6 +-
 tensorflow/contrib/makefile/build_with_docker.sh   |   2 +-
 .../contrib/makefile/compile_android_protobuf.sh   |   4 +-
 .../contrib/makefile/compile_ios_protobuf.sh       |  17 +-
 .../contrib/makefile/compile_ios_tensorflow.sh     |   4 +-
 tensorflow/contrib/makefile/compile_pi_protobuf.sh |   6 +-
 tensorflow/contrib/remote_fused_graph/pylib/BUILD  |   1 -
 tensorflow/contrib/rnn/python/ops/rnn_cell.py      |   2 +-
 .../python/kernel_tests/beam_search_ops_test.py    |   4 +-
 tensorflow/contrib/slim/README.md                  |   2 +-
 tensorflow/contrib/tfprof/README.md                |  23 --
 .../pywrap_tensorflow_print_model_analysis.i       |   1 -
 .../tools/tfprof/internal/run_metadata_test.py     |   2 +-
 .../tfprof/python/tools/tfprof/model_analyzer.py   | 121 ++++++--
 .../python/tools/tfprof/model_analyzer_test.py     |  30 +-
 .../tfprof/python/tools/tfprof/profiler_test.py    |  16 +-
 .../contrib/training/python/training/evaluation.py |   2 +-
 tensorflow/contrib/verbs/rdma.cc                   |  55 +---
 tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc    |  41 +--
 tensorflow/contrib/verbs/verbs_util.cc             |  34 --
 tensorflow/contrib/verbs/verbs_util.h              |  10 -
 tensorflow/core/common_runtime/direct_session.cc   |   4 +-
 tensorflow/core/debug/grpc_session_debug_test.cc   |   5 +-
 tensorflow/core/framework/graph_def_util.h         |   2 +-
 tensorflow/core/framework/op.h                     |   1 +
 tensorflow/core/framework/tensor.h                 |   2 +-
 tensorflow/core/graph/mkl_layout_pass.cc           | 277 +++++------------
 tensorflow/core/graph/mkl_layout_pass_test.cc      | 252 +--------------
 .../core/grappler/costs/virtual_scheduler_test.cc  |   2 +-
 tensorflow/core/grappler/grappler_item.h           |   1 -
 .../core/grappler/optimizers/auto_parallel.cc      |   5 -
 .../core/grappler/optimizers/auto_parallel.h       |   1 -
 .../core/grappler/optimizers/auto_parallel_test.cc |  42 ++-
 .../core/grappler/optimizers/layout_optimizer.cc   |   2 +-
 tensorflow/core/kernels/BUILD                      |   8 +-
 tensorflow/core/kernels/adjust_contrast_op.cc      |  24 --
 .../kernels/adjust_contrast_op_benchmark_test.cc   |   5 -
 tensorflow/core/kernels/colorspace_op.cc           |  15 -
 tensorflow/core/kernels/control_flow_ops.cc        | 136 +++-----
 tensorflow/core/kernels/cwise_op_add_2.cc          |   7 +-
 tensorflow/core/kernels/cwise_op_cosh.cc           |  37 ---
 tensorflow/core/kernels/cwise_op_gpu_add.cu.cc     |   3 +-
 tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc    |  26 --
 tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc    |  26 --
 tensorflow/core/kernels/cwise_op_invert.cc         |   2 +-
 tensorflow/core/kernels/cwise_op_sinh.cc           |  37 ---
 tensorflow/core/kernels/cwise_ops.h                |   6 -
 tensorflow/core/kernels/dynamic_stitch_op.cc       |  27 +-
 tensorflow/core/kernels/map_stage_op.cc            |   7 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc       |  69 ++---
 tensorflow/core/kernels/mkl_conv_ops.cc            |  51 +--
 tensorflow/core/kernels/mkl_lrn_op.cc              | 112 ++++---
 tensorflow/core/kernels/mkl_relu_op.cc             |  30 +-
 tensorflow/core/kernels/mkl_tfconv_op.cc           |  54 +---
 tensorflow/core/kernels/non_max_suppression_op.cc  |  36 +--
 tensorflow/core/kernels/priority_queue.cc          |   2 +-
 tensorflow/core/kernels/shape_ops.cc               |  56 ++--
 tensorflow/core/kernels/slice_op.cc                | 258 +---------------
 tensorflow/core/kernels/sparse_reduce_op.cc        | 341 ---------------------
 tensorflow/core/kernels/stack_ops.cc               |  68 +---
 tensorflow/core/kernels/topk_op.cc                 |   2 +-
 tensorflow/core/kernels/transpose_functor.h        |   7 -
 tensorflow/core/kernels/transpose_op.cc            |   5 +-
 .../kernels/typed_conditional_accumulator_base.h   |   2 +-
 tensorflow/core/lib/gtl/optional.h                 |   2 +-
 tensorflow/core/ops/math_grad.cc                   |  20 --
 tensorflow/core/ops/math_grad_test.cc              |  20 --
 tensorflow/core/ops/math_ops.cc                    |   8 -
 tensorflow/core/ops/nn_ops.cc                      |   2 -
 tensorflow/core/ops/ops.pbtxt                      | 169 +---------
 tensorflow/core/ops/sparse_ops.cc                  |  69 -----
 tensorflow/core/platform/cloud/retrying_utils.cc   |   2 +-
 tensorflow/core/protobuf/worker.proto              |   2 +-
 tensorflow/core/public/version.h                   |   2 +-
 tensorflow/core/util/mkl_util.h                    |  38 +--
 .../docs_src/api_guides/python/contrib.losses.md   |  10 +-
 tensorflow/docs_src/api_guides/python/math_ops.md  |   2 -
 tensorflow/docs_src/get_started/get_started.md     |   5 +-
 tensorflow/docs_src/get_started/mnist/beginners.md |   2 +-
 tensorflow/docs_src/get_started/mnist/mechanics.md |   2 +-
 tensorflow/docs_src/install/install_c.md           |   2 +-
 tensorflow/docs_src/install/install_go.md          |   2 +-
 tensorflow/docs_src/install/install_java.md        |  18 +-
 tensorflow/docs_src/install/install_linux.md       |  26 +-
 tensorflow/docs_src/install/install_mac.md         |  14 +-
 tensorflow/docs_src/install/install_sources.md     |   4 +-
 tensorflow/docs_src/install/install_windows.md     |  13 +-
 .../docs_src/performance/performance_guide.md      |   8 +-
 tensorflow/docs_src/performance/quantization.md    |  13 +-
 tensorflow/docs_src/tutorials/seq2seq.md           |  13 +-
 tensorflow/docs_src/tutorials/wide.md              |  18 +-
 tensorflow/docs_src/tutorials/wide_and_deep.md     |   6 +-
 tensorflow/docs_src/tutorials/word2vec.md          |   2 +-
 tensorflow/examples/image_retraining/retrain.py    |  43 +--
 tensorflow/examples/learn/examples_test.sh         |   2 +-
 tensorflow/examples/tutorials/mnist/mnist.py       |   2 +-
 .../examples/tutorials/word2vec/word2vec_basic.py  |  16 +-
 tensorflow/go/README.md                            |  29 ++
 tensorflow/go/genop/generate.sh                    |   2 +-
 tensorflow/go/op/wrappers.go                       |  30 --
 tensorflow/go/shape.go                             |   2 +-
 tensorflow/go/tensor.go                            |   2 +-
 tensorflow/java/BUILD                              |  26 +-
 tensorflow/java/build_defs.bzl                     |  11 +-
 tensorflow/java/maven/libtensorflow/pom.xml        |   4 +-
 tensorflow/java/maven/proto/pom.xml                |   6 +-
 tensorflow/java/maven/release.sh                   |   2 +-
 tensorflow/java/maven/run_inside_container.sh      |   2 +-
 .../java/src/main/java/org/tensorflow/Input.java   |  48 ---
 .../src/main/java/org/tensorflow/Operation.java    |  26 +-
 .../main/java/org/tensorflow/OperationBuilder.java |   2 +-
 .../java/src/main/java/org/tensorflow/Output.java  |  10 +-
 .../java/src/main/java/org/tensorflow/Session.java |   4 +-
 .../src/main/java/org/tensorflow/op/NameScope.java | 146 ---------
 .../src/main/java/org/tensorflow/op/Scope.java     | 165 ----------
 tensorflow/java/src/main/native/operation_jni.cc   |  18 --
 tensorflow/java/src/main/native/operation_jni.h    |  11 -
 .../test/java/org/tensorflow/OperationTest.java    |  21 --
 .../src/test/java/org/tensorflow/SessionTest.java  |   6 +-
 .../src/test/java/org/tensorflow/TensorTest.java   |   2 +-
 .../src/test/java/org/tensorflow/op/ScopeTest.java | 270 ----------------
 tensorflow/python/BUILD                            |   5 +-
 tensorflow/python/debug/cli/analyzer_cli.py        |   2 +-
 tensorflow/python/debug/wrappers/framework_test.py |   2 +-
 .../python/estimator/canned/dnn_linear_combined.py |   2 +-
 tensorflow/python/estimator/canned/linear.py       |  73 +++--
 tensorflow/python/kernel_tests/BUILD               |  10 +
 tensorflow/python/kernel_tests/basic_gpu_test.py   |   2 -
 tensorflow/python/kernel_tests/cwise_ops_test.py   |  15 -
 tensorflow/python/kernel_tests/fft_ops_test.py     |  63 ++--
 .../python/kernel_tests/map_stage_op_test.py       |  93 +++---
 .../python/kernel_tests/record_input_test.py       |  14 +-
 tensorflow/python/kernel_tests/sparse_ops_test.py  |  49 +--
 tensorflow/python/kernel_tests/stage_op_test.py    |  87 +++---
 tensorflow/python/ops/array_ops.py                 |  26 +-
 tensorflow/python/ops/data_flow_ops.py             |  31 +-
 .../python/ops/distributions/special_math.py       |   2 +-
 tensorflow/python/ops/lookup_ops.py                |   6 +-
 tensorflow/python/ops/math_grad.py                 |  18 --
 tensorflow/python/ops/math_ops.py                  |   7 +-
 tensorflow/python/ops/rnn_cell_impl.py             |   4 +-
 tensorflow/python/ops/sparse_ops.py                |  86 ------
 tensorflow/python/ops/tensor_array_ops.py          |   2 +-
 tensorflow/python/ops/variable_scope.py            |   5 -
 .../tools/print_selective_registration_header.py   |   2 +-
 tensorflow/python/training/input.py                |   3 -
 tensorflow/tensorflow.bzl                          |   2 +-
 tensorflow/tools/api/golden/tensorflow.pbtxt       |  16 -
 tensorflow/tools/ci_build/builds/pip.sh            |  27 +-
 tensorflow/tools/ci_build/ci_build.sh              |  10 +-
 .../tools/ci_build/ci_parameterized_build.sh       |  17 +-
 tensorflow/tools/ci_build/ci_sanity.sh             |  24 +-
 .../tools/ci_build/install/install_pip_packages.sh |   4 +-
 .../tools/ci_build/install/install_proto3.sh       |   6 +-
 .../install/install_python3.5_pip_packages.sh      |   2 +-
 .../ci_build/protobuf/protobuf_optimized_pip.sh    |   2 +-
 .../tools/ci_build/windows/bazel/bazel_test_lib.sh |   2 +-
 .../tools/ci_build/windows/cpu/cmake/run_py.bat    |   5 +-
 .../tools/ci_build/windows/gpu/cmake/run_py.bat    |   5 +-
 tensorflow/tools/dist_test/local_test.sh           |   6 +-
 tensorflow/tools/docker/README.md                  |  28 +-
 .../tools/docker/parameterized_docker_build.sh     |  44 +--
 tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh     |  11 +-
 tensorflow/tools/git/gen_git_source.sh             |   2 +-
 tensorflow/tools/graph_transforms/README.md        |   2 +-
 .../graph_transforms/quantize_weights_test.cc      |   4 +-
 .../tools/lib_package/libtensorflow_java_test.sh   |   2 +-
 tensorflow/tools/lib_package/libtensorflow_test.sh |   2 +-
 tensorflow/tools/pip_package/setup.py              |   2 +-
 tensorflow/tools/tfprof/BUILD                      |   1 +
 tensorflow/tools/tfprof/README.md                  |  12 +-
 tensorflow/tools/tfprof/g3doc/advise.md            |  80 ++++-
 tensorflow/tools/tfprof/g3doc/command_line.md      |   2 +-
 tensorflow/tools/tfprof/internal/advisor/BUILD     |   9 +
 .../advisor/accelerator_utilization_checker.h      |  24 +-
 tensorflow/tools/tfprof/internal/advisor/checker.h |  24 +-
 .../internal/advisor/internal_checker_runner.h     |   5 +-
 .../advisor/internal_checker_runner_dummy.cc       |   6 +-
 .../tfprof/internal/advisor/operation_checker.h    |  19 +-
 .../tools/tfprof/internal/advisor/tfprof_advisor.h |  47 ++-
 .../tfprof/internal/advisor/tfprof_advisor_test.cc |  46 ++-
 .../tools/tfprof/internal/print_model_analysis.cc  |  19 +-
 .../tools/tfprof/internal/print_model_analysis.h   |   2 -
 tensorflow/tools/tfprof/internal/tfprof_graph.h    |   4 +-
 tensorflow/tools/tfprof/internal/tfprof_op.cc      |  11 +-
 tensorflow/tools/tfprof/internal/tfprof_op.h       |   6 +-
 tensorflow/tools/tfprof/internal/tfprof_options.h  |   4 +-
 tensorflow/tools/tfprof/internal/tfprof_show.cc    |  14 +-
 tensorflow/tools/tfprof/internal/tfprof_show.h     |  13 +-
 .../tools/tfprof/internal/tfprof_show_multi.cc     |  19 +-
 .../tools/tfprof/internal/tfprof_show_multi.h      |  16 +-
 .../tools/tfprof/internal/tfprof_show_test.cc      |   1 +
 tensorflow/tools/tfprof/internal/tfprof_stats.cc   |  77 +++--
 tensorflow/tools/tfprof/internal/tfprof_stats.h    |  22 +-
 .../tools/tfprof/internal/tfprof_stats_test.cc     |   1 +
 .../tools/tfprof/internal/tfprof_tensor_test.cc    |   1 +
 .../tools/tfprof/internal/tfprof_timeline_test.cc  |   1 +
 tensorflow/tools/tfprof/tfprof_main.cc             |  25 +-
 tensorflow/tools/tfprof/tfprof_output.proto        |   8 +
 tensorflow/workspace.bzl                           |  24 +-
 tools/tf_env_collect.sh                            | 143 ++++-----
 290 files changed, 1733 insertions(+), 5824 deletions(-)
 delete mode 100644 tensorflow/compiler/plugin/executor/BUILD
 delete mode 100644 tensorflow/compiler/plugin/executor/compiler.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/compiler.h
 delete mode 100644 tensorflow/compiler/plugin/executor/device.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/executable.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/executable.h
 delete mode 100644 tensorflow/compiler/plugin/executor/executor.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/executor.h
 delete mode 100644 tensorflow/compiler/plugin/executor/platform.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/platform.h
 delete mode 100644 tensorflow/compiler/plugin/executor/platform_id.h
 delete mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.cc
 delete mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.h
 delete mode 100644 tensorflow/core/kernels/cwise_op_cosh.cc
 delete mode 100644 tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
 delete mode 100644 tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
 delete mode 100644 tensorflow/core/kernels/cwise_op_sinh.cc
 delete mode 100644 tensorflow/core/kernels/sparse_reduce_op.cc
 delete mode 100644 tensorflow/java/src/main/java/org/tensorflow/Input.java
 delete mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
 delete mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
 delete mode 100644 tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 43abdaafbf..c78b6b1a15 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -159,12 +159,7 @@ There are two ways to run TensorFlow unit tests.
    bazel test ${flags} //tensorflow/python/...
    ```
 
-2. Using [Docker](www.docker.com) and TensorFlow's CI scripts.
-
-   ```bash
-   # Install Docker first, then this will build and run cpu tests
-   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
-   ```
+2. Using Docker and TensorFlow's CI scripts.
 
    See
    [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 5b37028c50..6f4c048ce8 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -6,7 +6,6 @@ If you open a GitHub issue, here is our policy:
 
 1. It must be a bug or a feature request.
 2. The form below must be filled out.
-3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorflow/issues).
 
 **Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
 
diff --git a/README.md b/README.md
index abbead98a7..e7dbf57b25 100644
--- a/README.md
+++ b/README.md
@@ -34,13 +34,13 @@ and discussion.**
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
-* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
 #### *Try your first TensorFlow program*
diff --git a/RELEASE.md b/RELEASE.md
index 9875838d7e..d22c5c62fe 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -113,8 +113,6 @@
   checkpoints containing such RNN cells, in which case you can use the
   [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
   to convert the variable names in your old checkpoints.
-* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal
-  (explicit) kernel methods in TensorFlow.
 
 ## Bug Fixes and Other Changes
 * In python, `Operation.get_attr` on type attributes returns the Python DType
diff --git a/configure b/configure
index 602124225f..e1aaddabda 100755
--- a/configure
+++ b/configure
@@ -162,12 +162,8 @@ bazel version > bazel.version
 curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3)
 rm -f bazel.version
 
-
 echo "You have bazel $curr_bazel_version installed."
-if [ -z "$curr_bazel_version" ]; then
-  echo "WARNING: current bazel installation is not a release version."
-  echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION."
-elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
+if [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
   echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!"
   echo "Exiting..."
   exit 1
diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
index 02a6a58b61..73d427d9b2 100755
--- a/tensorflow/c/generate-pc.sh
+++ b/tensorflow/c/generate-pc.sh
@@ -26,7 +26,7 @@ usage() {
 [ $# == 0 ] && usage && exit 0
 
 # read the options
-ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@")
+ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
 eval set -- "$ARGS"
 
 # extract options and their arguments into variables.
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 9801add1da..a884f11d48 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -472,23 +472,10 @@ cc_binary(
     name = "tutorials_example_trainer",
     srcs = ["tutorials/example_trainer.cc"],
     copts = tf_copts(),
-    linkopts = select({
-        "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
-        "//tensorflow:darwin": [
-            "-lm",
-            "-lpthread",
-        ],
-        "//tensorflow:ios": [
-            "-lm",
-            "-lpthread",
-        ],
-        "//conditions:default": [
-            "-lm",
-            "-lpthread",
-            "-lrt",
-        ],
-    }),
+    linkopts = [
+        "-lpthread",
+        "-lm",
+    ],
     deps = [
         ":cc_ops",
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 71d9a8ed7b..8c1a01f518 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -162,32 +162,6 @@ Status Log1pGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Log1p", Log1pGrad);
 
-Status SinhGrad(const Scope& scope, const Operation& op,
-                const std::vector<Output>& grad_inputs,
-                std::vector<Output>* grad_outputs) {
-  // y = sinh(x)
-  // dy/dx = cosh(x)
-  auto dydx = Cosh(scope, op.input(0));
-  // grad(x) = grad(y) * conj(dy/dx)
-  grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("Sinh", SinhGrad);
-
-Status CoshGrad(const Scope& scope, const Operation& op,
-                const std::vector<Output>& grad_inputs,
-                std::vector<Output>* grad_outputs) {
-  // y = cosh(x)
-  // dy/dx = sinh(x)
-  auto dydx = Sinh(scope, op.input(0));
-  // grad(x) = grad(y) * conj(dy/dx)
-  grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("Cosh", CoshGrad);
-
 Status TanhGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 1653b04378..de6baa1769 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -45,8 +45,6 @@ class CWiseUnaryGradTest : public ::testing::Test {
     EXPM1,
     LOG,
     LOG1P,
-    SINH,
-    COSH,
     TANH,
     SIGMOID,
     SIGN,
@@ -113,12 +111,6 @@ class CWiseUnaryGradTest : public ::testing::Test {
       case LOG1P:
         y = Log1p(scope_, x);
         break;
-      case SINH:
-        y = Sinh(scope_, x);
-        break;
-      case COSH:
-        y = Cosh(scope_, x);
-        break;
       case TANH:
         y = Tanh(scope_, x);
         break;
@@ -345,50 +337,6 @@ TEST_F(CWiseUnaryGradTest, Log1p_Complex) {
   TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
 }
 
-TEST_F(CWiseUnaryGradTest, Sinh) {
-  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
-  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
-  auto dx_fn = [this](const float x, const float dy) {
-    return dy * std::cosh(x);
-  };
-  TestCWiseGrad<float>(SINH, x_fn, dy_fn, dx_fn);
-}
-
-TEST_F(CWiseUnaryGradTest, Sinh_Complex) {
-  auto x_fn = [this](const int i) {
-    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
-  };
-  auto dy_fn = [this](const complex64& x) {
-    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
-  };
-  auto dx_fn = [this](const complex64& x, const complex64& dy) {
-    return dy * conjugate(std::cosh(x));
-  };
-  TestCWiseGrad<complex64>(SINH, x_fn, dy_fn, dx_fn);
-}
-
-TEST_F(CWiseUnaryGradTest, Cosh) {
-  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
-  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
-  auto dx_fn = [this](const float x, const float dy) {
-    return dy * std::sinh(x);
-  };
-  TestCWiseGrad<float>(COSH, x_fn, dy_fn, dx_fn);
-}
-
-TEST_F(CWiseUnaryGradTest, Cosh_Complex) {
-  auto x_fn = [this](const int i) {
-    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
-  };
-  auto dy_fn = [this](const complex64& x) {
-    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
-  };
-  auto dx_fn = [this](const complex64& x, const complex64& dy) {
-    return dy * conjugate(std::sinh(x));
-  };
-  TestCWiseGrad<complex64>(COSH, x_fn, dy_fn, dx_fn);
-}
-
 TEST_F(CWiseUnaryGradTest, Tanh) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 952b2015ed..5e5203d090 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -46,19 +46,6 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
-Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
-                   const std::vector<Output>& grad_inputs,
-                   std::vector<Output>* grad_outputs) {
-
-  auto softmax = Exp(scope, op.output(0));
-  auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
-  auto mul = Mul(scope, sum, softmax);
-  auto dx = Sub(scope, grad_inputs[0], mul);
-  grad_outputs->push_back(dx);
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("LogSoftmax", LogSoftmaxGrad);
-
 Status ReluGradHelper(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index daa87546ec..70c9bd4e08 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -57,19 +57,6 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
-TEST_F(NNGradTest, LogSoftmaxGrad) {
-  TensorShape shape({5, 3});
-  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
-  auto y = LogSoftmax(scope_, x);
-  // Avoid numerical instability when computing finite differences.
-  Tensor x_init_value = test::AsTensor<float>(
-          {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f,
-           0.1f, 0.3f, 0.5f, 0.7f, 0.8f,
-           -0.1f, 0.1f, 0.1f, 0.1f, 1.2f},
-          {5, 3});
-  RunTest(x, x_init_value, y, shape);
-}
-
 TEST_F(NNGradTest, ReluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index 8c2e9a7c81..4badd3a589 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -32,7 +32,5 @@ package(
 
 cc_library(
     name = "plugin",
-    deps = [
-        "//tensorflow/compiler/plugin/executor:plugin_lib",
-    ],
+    deps = [],
 )
diff --git a/tensorflow/compiler/plugin/executor/BUILD b/tensorflow/compiler/plugin/executor/BUILD
deleted file mode 100644
index 9bc706abdf..0000000000
--- a/tensorflow/compiler/plugin/executor/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "plugin_lib",
-    srcs = glob([
-        "*.cc",
-    ]),
-    hdrs = glob([
-        "*.h",
-    ]),
-    deps = [
-        "//tensorflow/compiler/jit:xla_jit_headers_lib",
-        "//tensorflow/compiler/xla:xla_headers_lib",
-        "//tensorflow/compiler/xla/service:hlo_evaluator",
-        "//third_party/eigen3",
-        "@local_config_cuda//cuda:cuda_headers",
-        "@protobuf//:protobuf_headers",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/compiler/plugin/executor/compiler.cc b/tensorflow/compiler/plugin/executor/compiler.cc
deleted file mode 100644
index 893ff152f0..0000000000
--- a/tensorflow/compiler/plugin/executor/compiler.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <stdlib.h>
-#include <fstream>
-
-#include "tensorflow/compiler/plugin/executor/compiler.h"
-#include "tensorflow/compiler/plugin/executor/executable.h"
-
-#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
-#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
-#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
-#include "tensorflow/compiler/xla/service/hlo_cse.h"
-#include "tensorflow/compiler/xla/service/hlo_dce.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
-#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
-#include "tensorflow/compiler/xla/service/inliner.h"
-#include "tensorflow/compiler/xla/service/reshape_mover.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-
-#include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
-
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::executorplugin;
-namespace port = ::perftools::gputools::port;
-
-namespace xla {
-namespace executorplugin {
-
-/*
- * Run optimization passes on the module.  The graph is transformed by
- * each pass in the optimization pipeline.  The service subdirectory
- * contains useful optimization passes.
- */
-Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module,
-                                            HloDumper dump_hlo) {
-  HloPassPipeline pipeline("Executor", dump_hlo);
-  pipeline.AddPass<Inliner>();
-  pipeline.AddPass<HloSubcomputationUnification>();
-  pipeline.AddPass<HloCSE>(false);
-
-  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-      false, [](const Shape&, const Shape&) { return false; });
-  pipeline.AddPass<ReshapeMover>();
-  pipeline.AddPass<HloConstantFolding>();
-  pipeline.AddPass<HloCSE>(true);
-
-  pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<FlattenCallGraph>();
-  return pipeline.Run(hlo_module).status();
-}
-
-StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
-        std::unique_ptr<HloModule> hlo_module, HloDumper dump_hlo,
-        se::StreamExecutor* stream_exec) {
-  TF_RET_CHECK(stream_exec != nullptr);
-
-  VLOG(1) << "Generate graph " << hlo_module->name();
-
-  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get(), dump_hlo));
-
-  // Typically you would visit the HLO graph, building up a compiled equivalent
-  // In this case we are using an Hlo evaluator at execution time, so we don't
-  // need to compile anything
-
-  // Create executable from only the Hlo module
-  std::unique_ptr<Executable> executable;
-  executable.reset(new ExecutorExecutable(std::move(hlo_module)));
-
-  return std::move(executable);
-}
-
-StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
-        std::vector<std::unique_ptr<HloModule>> hlo_modules,
-        HloDumper dump_hlos, std::vector<se::StreamExecutor*> stream_execs) {
-
-  return tensorflow::errors::Unimplemented(
-      "Compilation of multiple HLO modules is not supported on Executor.");
-}
-
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-ExecutorCompiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> hlo_modules,
-    HloDumper dump_hlo, const AotCompilationOptions& aot_options) {
-
-  return tensorflow::errors::InvalidArgument(
-      "AOT compilation not supported on Executor");
-}
-
-se::Platform::Id ExecutorCompiler::PlatformId() const {
-  return sep::kExecutorPlatformId;
-}
-
-HloCostAnalysis::ShapeSizeFunction
-ExecutorCompiler::ShapeSizeBytesFunction() const {
-  return ExecutorExecutable::ShapeSizeBytes;
-}
-
-
-}  // namespace executorplugin
-}  // namespace xla
-
-REGISTER_MODULE_INITIALIZER(executor_compiler, {
-  xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
-    return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
-  });
-});
diff --git a/tensorflow/compiler/plugin/executor/compiler.h b/tensorflow/compiler/plugin/executor/compiler.h
deleted file mode 100644
index 8fe591c8ab..0000000000
--- a/tensorflow/compiler/plugin/executor/compiler.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
-#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
-
-#include <memory>
-
-#include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-
-#include "tensorflow/compiler/plugin/executor/platform_id.h"
-
-namespace xla {
-namespace executorplugin {
-
-class ExecutorCompiler : public Compiler {
- public:
-  ExecutorCompiler() {}
-  ~ExecutorCompiler() override {}
-
-  StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> hlo_module,
-      HloDumper dump_hlo,
-      perftools::gputools::StreamExecutor* stream_exec) override;
-
-  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_module,
-      HloDumper dump_hlo,
-      std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
-
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      std::vector<std::unique_ptr<HloModule>> module,
-      HloDumper dump_hlo, const AotCompilationOptions& options) override;
-
-  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
-
-  perftools::gputools::Platform::Id PlatformId() const override;
-
- private:
-  Status RunHloOptimization(HloModule* hlo_module, HloDumper dump_hlo);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler);
-};
-
-}  // namespace executorplugin
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
diff --git a/tensorflow/compiler/plugin/executor/device.cc b/tensorflow/compiler/plugin/executor/device.cc
deleted file mode 100644
index bbc39dc03f..0000000000
--- a/tensorflow/compiler/plugin/executor/device.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h"
-#include "tensorflow/compiler/jit/xla_device.h"
-#include "tensorflow/compiler/jit/xla_device_ops.h"
-#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-
-namespace tensorflow {
-
-const char* const DEVICE_XLA_EXEC = "XLA_EXEC";
-const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT";
-
-constexpr std::array<DataType, 5> kExecAllTypes = {
-    {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}};
-
-class XlaExaDeviceFactory : public DeviceFactory {
- public:
-  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
-};
-
-Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options,
-                                          const string& name_prefix,
-                                          std::vector<Device*>* devices) {
-  static XlaDeviceOpRegistrations* registrations =
-      RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT);
-  (void)registrations;
-
-  std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0,
-                                       DEVICE_EXEC_XLA_JIT, options,
-                                       name_prefix, &device));
-  devices->push_back(device.release());
-  return Status::OK();
-}
-
-REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110);
-
-// Kernel registrations
-
-static bool OpFilter(KernelDef* kdef) { return true; }
-
-REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes);
-REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes);
-REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter);
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/plugin/executor/executable.cc b/tensorflow/compiler/plugin/executor/executable.cc
deleted file mode 100644
index 79eea9af3f..0000000000
--- a/tensorflow/compiler/plugin/executor/executable.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/plugin/executor/executable.h"
-#include "tensorflow/compiler/plugin/executor/executor.h"
-
-#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::executorplugin;
-
-namespace xla {
-namespace executorplugin {
-
-ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
-    : Executable(std::move(hlo_module), ShapeSizeBytes) {}
-
-ExecutorExecutable::~ExecutorExecutable() {}
-
-static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor,
-                                                 const Literal& literal) {
-  int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
-  void* buf = executor->Allocate(size);
-  const void* src = literal.InternalData();
-  memcpy(buf, src, size);
-  return se::DeviceMemoryBase(buf, size);
-}
-
-static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor,
-                                                 const Literal& literal) {
-  const Shape& shape = literal.shape();
-  if (shape.element_type() != xla::TUPLE) {
-    return AllocateSingleOutput(executor, literal);
-  } else {
-    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
-    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
-    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
-      se::DeviceMemoryBase out =
-          AllocateSingleOutput(executor, literal.tuple_literals(n));
-      *buf++ = out.opaque();
-    }
-
-    return se::DeviceMemoryBase(buf, size);
-  }
-}
-
-StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-
-  VLOG(1) << "Execute " << module().name();
-  if (VLOG_IS_ON(2)) {
-    for (const auto& a : arguments) {
-      VLOG(2) << "-- argument " << a.opaque();
-    }
-  }
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
-  HloComputation* computation = module().entry_computation();
-  if (computation->num_parameters() != arguments.size()) {
-    return tensorflow::errors::Internal(
-        "Mismatch between argument count and graph parameter count.");
-  }
-
-  // Create the arguments as an vector of XLA literals
-  std::vector<std::unique_ptr<Literal>> arg_literals;
-  std::vector<Literal*> arg_literals_ptrs;
-  for (int64 p = 0; p < computation->num_parameters(); p++) {
-    // Create the input literal for the parameter
-    HloInstruction* param = computation->parameter_instruction(p);
-    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
-    arg_literals_ptrs.push_back(arg_literals.back().get());
-
-    // Copy in the data from the stream_executor buffers
-    void* buffer = arg_literals.back().get()->MutableInternalData();
-    memcpy(buffer, arguments[p].opaque(),
-           ShapeUtil::ByteSizeOf(param->shape()));
-  }
-
-  // Execute the graph using the evaluator
-  HloEvaluator evaluator;
-  std::unique_ptr<Literal> output;
-  TF_ASSIGN_OR_RETURN(output,
-                      evaluator.Evaluate(computation, arg_literals_ptrs));
-
-  // Copy the result into the return buffer
-  perftools::gputools::StreamExecutor* executor(stream->parent());
-  sep::ExecutorExecutor* executorExecutor(
-      static_cast<sep::ExecutorExecutor*>(executor->implementation()));
-
-  se::DeviceMemoryBase ret =
-      AllocateOutputBuffer(executorExecutor, *(output.get()));
-
-  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
-
-  {
-    tensorflow::mutex_lock lock(mutex_);
-    const double nanoseconds = (end_micros - start_micros) * 1000.0;
-    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-  }
-
-  return ret;
-}
-
-StatusOr<std::unique_ptr<ShapedBuffer>> ExecutorExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  return tensorflow::errors::Unimplemented(
-      "ExecuteOnStream is not yet supported on Executor.");
-}
-
-StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  return tensorflow::errors::Unimplemented(
-      "ExecuteAsyncOnStream is not yet supported on Executor.");
-}
-
-/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) {
-  if (ShapeUtil::IsOpaque(shape)) {
-    return sizeof(void*);
-  }
-  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-}
-
-
-}  // namespace executorplugin
-}  // namespace xla
diff --git a/tensorflow/compiler/plugin/executor/executable.h b/tensorflow/compiler/plugin/executor/executable.h
deleted file mode 100644
index ba3d4da21d..0000000000
--- a/tensorflow/compiler/plugin/executor/executable.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
-#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
-
-#include <cstddef>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-
-namespace xla {
-namespace executorplugin {
-
-class ExecutorExecutable : public Executable {
- public:
-  ExecutorExecutable(std::unique_ptr<HloModule> hlo_module);
-  ~ExecutorExecutable() override;
-
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
-
-  static int64 ShapeSizeBytes(const Shape& shape);
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
-};
-
-}  // namespace executorplugin
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
diff --git a/tensorflow/compiler/plugin/executor/executor.cc b/tensorflow/compiler/plugin/executor/executor.cc
deleted file mode 100644
index e72c2711f7..0000000000
--- a/tensorflow/compiler/plugin/executor/executor.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/plugin/executor/executor.h"
-#include "tensorflow/compiler/plugin/executor/platform_id.h"
-
-#include "tensorflow/compiler/xla/status_macros.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-namespace se = ::perftools::gputools;
-
-namespace perftools {
-namespace gputools {
-namespace executorplugin {
-
-host::HostStream *AsExecutorStream(Stream *stream) {
-  DCHECK(stream != nullptr);
-  return dynamic_cast<host::HostStream *>(stream->implementation());
-}
-
-ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config)
-    : plugin_config_(plugin_config) {}
-
-ExecutorExecutor::~ExecutorExecutor() {}
-
-void *ExecutorExecutor::Allocate(uint64 size) {
-  void *buf = new char[size];
-  return buf;
-}
-
-void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
-                                         uint64 offset_bytes,
-                                         uint64 size_bytes) {
-  return parent + offset_bytes;
-}
-
-void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) {
-  if (!mem->is_sub_buffer()) {
-    delete[] static_cast<char *>(mem->opaque());
-  }
-}
-
-bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst,
-                             const DeviceMemoryBase &dev_src, uint64 size) {
-  AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
-    port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
-  });
-  return true;
-}
-
-bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
-                             const void *host_src, uint64 size) {
-  AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
-    port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
-  });
-  return true;
-}
-
-port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
-                                                const void *host_src,
-                                                uint64 size) {
-  memcpy(dev_dst->opaque(), host_src, size);
-  return port::Status::OK();
-}
-
-port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst,
-                                                const DeviceMemoryBase &dev_src,
-                                                uint64 size) {
-  memcpy(host_dst, dev_src.opaque(), size);
-  return port::Status::OK();
-}
-
-bool ExecutorExecutor::HostCallback(Stream *stream,
-                                   std::function<void()> callback) {
-  AsExecutorStream(stream)->EnqueueTask(callback);
-  return true;
-}
-
-bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
-  AsExecutorStream(dependent)->EnqueueTask(
-      [other]() { other->BlockHostUntilDone(); });
-  AsExecutorStream(dependent)->BlockUntilDone();
-  return true;
-}
-
-bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) {
-  dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
-  return true;
-}
-
-bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) {
-  dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
-  return true;
-}
-
-bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) {
-  AsExecutorStream(stream)->BlockUntilDone();
-  return true;
-}
-
-DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const {
-  internal::DeviceDescriptionBuilder builder;
-
-  builder.set_device_address_bits(64);
-
-  builder.set_name("Executor");
-  builder.set_device_vendor("VectorName");
-  builder.set_platform_version("1.0");
-  builder.set_driver_version("1.0");
-  builder.set_runtime_version("1.0");
-  builder.set_pci_bus_id("1");
-  builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
-  builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
-
-  auto built = builder.Build();
-  return built.release();
-}
-
-}  // namespace executorplugin
-}  // namespace gputools
-}  // namespace perftools
diff --git a/tensorflow/compiler/plugin/executor/executor.h b/tensorflow/compiler/plugin/executor/executor.h
deleted file mode 100644
index 32fdb157e4..0000000000
--- a/tensorflow/compiler/plugin/executor/executor.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Declares the ExecutorExecutor class, which is a CPU-only implementation of
-// the StreamExecutor interface. For now, this is used for testing and to
-// examine the performance of host-based StreamExecutor code.
-#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
-#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
-
-#include "tensorflow/stream_executor/host/host_stream.h"
-#include "tensorflow/stream_executor/host/host_timer.h"
-
-#include "tensorflow/compiler/xla/shape_util.h"
-
-#include "tensorflow/stream_executor/blas.h"
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/rng.h"
-#include "tensorflow/stream_executor/stream_executor.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-
-#include <list>
-#include <mutex>
-
-namespace perftools {
-namespace gputools {
-namespace executorplugin {
-
-using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
-
-class ExecutorExecutor : public internal::StreamExecutorInterface {
- public:
-  explicit ExecutorExecutor(const PluginConfig &plugin_config);
-  ~ExecutorExecutor() override;
-
-  port::Status Init(int device_ordinal, DeviceOptions device_options) override {
-    return port::Status::OK();
-  }
-
-  bool GetKernel(const MultiKernelLoaderSpec &spec,
-                 KernelBase *kernel) override {
-    return false;
-  }
-  bool Launch(Stream *stream, const ThreadDim &thread_dims,
-              const BlockDim &block_dims, const KernelBase &kernel,
-              const KernelArgsArrayBase &args) override {
-    return false;
-  }
-
-  void *Allocate(uint64 size) override;
-  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
-                          uint64 size_bytes) override;
-  void Deallocate(DeviceMemoryBase *mem) override;
-
-  void *HostMemoryAllocate(uint64 size) override { return new char[size]; }
-  void HostMemoryDeallocate(void *mem) override {
-    delete[] static_cast<char *>(mem);
-  }
-  bool HostMemoryRegister(void *mem, uint64 size) override { return true; }
-  bool HostMemoryUnregister(void *mem) override { return true; }
-
-  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src,
-              uint64 size) override;
-  bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src,
-              uint64 size) override;
-  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst,
-                            const DeviceMemoryBase &host_src,
-                            uint64 size) override {
-    return false;
-  }
-
-  bool MemZero(Stream *stream, DeviceMemoryBase *location,
-               uint64 size) override {
-    return false;
-  }
-  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
-              uint64 size) override {
-    return false;
-  }
-  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
-                uint64 size) override {
-    return false;
-  }
-
-  // No "synchronize all activity" implemented for this platform at the moment.
-  bool SynchronizeAllActivity() override { return false; }
-  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override {
-    return false;
-  }
-
-  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
-                         uint64 size) override {
-    return false;
-  }
-
-  port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst,
-                                 const void *host_src, uint64 size) override;
-  port::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &pop_src,
-                                 uint64 size) override;
-  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst,
-                                               const DeviceMemoryBase &pop_src,
-                                               uint64 size) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
-  }
-
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
-
-  port::Status AllocateEvent(Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
-  }
-
-  port::Status DeallocateEvent(Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
-  }
-
-  port::Status RecordEvent(Stream *stream, Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
-  }
-
-  port::Status WaitForEvent(Stream *stream, Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
-  }
-
-  Event::Status PollForEventStatus(Event *event) override {
-    return Event::Status::kError;
-  }
-
-  bool AllocateStream(Stream *stream) override { return true; }
-  void DeallocateStream(Stream *stream) override {}
-  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
-
-  bool AllocateTimer(Timer *timer) override { return true; }
-  void DeallocateTimer(Timer *timer) override {}
-  bool StartTimer(Stream *stream, Timer *timer) override;
-  bool StopTimer(Stream *stream, Timer *timer) override;
-
-  bool BlockHostUntilDone(Stream *stream) override;
-
-  int PlatformDeviceCount() override { return 1; }
-
-  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
-    return false;
-  }
-
-  DeviceDescription *PopulateDeviceDescription() const override;
-
-  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
-    return port::Status::OK();
-  }
-
-  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override {
-    return true;
-  }
-
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
-    return SharedMemoryConfig::kDefault;
-  }
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
-    return port::Status{port::error::UNIMPLEMENTED,
-                        "Shared memory not supported"};
-  }
-
-  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
-      override {
-    return nullptr;
-  }
-
-  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
-      override {
-    return nullptr;
-  }
-
-  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
-      override {
-    return std::unique_ptr<internal::StreamInterface>(new host::HostStream());
-  }
-
-  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
-    return std::unique_ptr<internal::TimerInterface>(new host::HostTimer());
-  }
-
-  port::StatusOr<DeviceMemoryBase> ExecuteGraph(const xla::Shape &shape,
-                                                Args args);
-
- private:
-  DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape);
-
-  port::StatusOr<DeviceMemoryBase> AllocateOutputBuffer(
-      const xla::Shape &shape);
-
-  const PluginConfig plugin_config_;
-};
-
-}  // namespace executorplugin
-}  // namespace gputools
-}  // namespace perftools
-
-#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
diff --git a/tensorflow/compiler/plugin/executor/platform.cc b/tensorflow/compiler/plugin/executor/platform.cc
deleted file mode 100644
index 2f339f04a7..0000000000
--- a/tensorflow/compiler/plugin/executor/platform.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/plugin/executor/platform.h"
-#include "tensorflow/compiler/plugin/executor/executor.h"
-#include "tensorflow/compiler/plugin/executor/platform_id.h"
-
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/initialize.h"
-#include "tensorflow/stream_executor/lib/ptr_util.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/status_macros.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::executorplugin;
-
-namespace perftools {
-namespace gputools {
-namespace executorplugin {
-
-PLATFORM_DEFINE_ID(kExecutorPlatformId);
-
-ExecutorPlatform::ExecutorPlatform() : name_("Executor") {}
-
-ExecutorPlatform::~ExecutorPlatform() {}
-
-Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; }
-
-int ExecutorPlatform::VisibleDeviceCount() const { return 1; }
-
-const string& ExecutorPlatform::Name() const { return name_; }
-
-port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
-    int ordinal) {
-  StreamExecutorConfig config;
-  config.ordinal = ordinal;
-  config.plugin_config = PluginConfig();
-  config.device_options = DeviceOptions::Default();
-  return GetExecutor(config);
-}
-
-port::StatusOr<StreamExecutor*>
-ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
-    int device_ordinal, const PluginConfig& plugin_config) {
-  StreamExecutorConfig config;
-  config.ordinal = device_ordinal;
-  config.plugin_config = plugin_config;
-  config.device_options = DeviceOptions::Default();
-  return GetExecutor(config);
-}
-
-port::StatusOr<StreamExecutor*> ExecutorPlatform::GetExecutor(
-    const StreamExecutorConfig& config) {
-  mutex_lock lock(executors_mutex_);
-
-  port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config);
-  if (status.ok()) {
-    return status.ValueOrDie();
-  }
-
-  port::StatusOr<std::unique_ptr<StreamExecutor>> executor =
-      GetUncachedExecutor(config);
-  if (!executor.ok()) {
-    return executor.status();
-  }
-
-  StreamExecutor* naked_executor = executor.ValueOrDie().get();
-  SE_RETURN_IF_ERROR(
-      executor_cache_.Insert(config, executor.ConsumeValueOrDie()));
-  return naked_executor;
-}
-
-port::StatusOr<std::unique_ptr<StreamExecutor>>
-ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<ExecutorExecutor>(config.plugin_config));
-  auto init_status = executor->Init(config.ordinal, config.device_options);
-  if (!init_status.ok()) {
-    return port::Status{
-        port::error::INTERNAL,
-        port::Printf(
-            "failed initializing StreamExecutor for device ordinal %d: %s",
-            config.ordinal, init_status.ToString().c_str())};
-  }
-
-  return std::move(executor);
-}
-
-void ExecutorPlatform::RegisterTraceListener(
-    std::unique_ptr<TraceListener> listener) {
-  LOG(FATAL) << "not yet implemented: register executor trace listener";
-}
-
-void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) {
-  LOG(FATAL) << "not yet implemented: unregister executor trace listener";
-}
-
-static void InitializeExecutorPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::ExecutorPlatform);
-  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
-}
-
-}  // namespace executorplugin
-}  // namespace gputools
-}  // namespace perftools
-
-REGISTER_MODULE_INITIALIZER(executor_platform, sep::InitializeExecutorPlatform());
-
-DECLARE_MODULE_INITIALIZER(multi_platform_manager);
-// Note that module initialization sequencing is not supported in the
-// open-source project, so this will be a no-op there.
-REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform, multi_platform_manager);
diff --git a/tensorflow/compiler/plugin/executor/platform.h b/tensorflow/compiler/plugin/executor/platform.h
deleted file mode 100644
index c252a589d4..0000000000
--- a/tensorflow/compiler/plugin/executor/platform.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
-#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/stream_executor/executor_cache.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/multi_platform_manager.h"
-#include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_pimpl.h"
-#include "tensorflow/stream_executor/trace_listener.h"
-
-namespace perftools {
-namespace gputools {
-namespace executorplugin {
-
-class ExecutorPlatform : public Platform {
- public:
-  ExecutorPlatform();
-  ~ExecutorPlatform() override;
-
-  Platform::Id id() const override;
-
-  // Device count is less clear-cut for CPUs than accelerators. This call
-  // currently returns the number of thread units in the host, as reported by
-  // base::NumCPUs().
-  int VisibleDeviceCount() const override;
-
-  const string& Name() const override;
-
-  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
-
-  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
-      int ordinal, const PluginConfig& config) override;
-
-  port::StatusOr<StreamExecutor*> GetExecutor(
-      const StreamExecutorConfig& config) override;
-
-  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
-      const StreamExecutorConfig& config) override;
-
-  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
-
-  void UnregisterTraceListener(TraceListener* listener) override;
-
- private:
-  // This platform's name.
-  string name_;
-
-  // mutex that guards the ordinal-to-executor map.
-  mutable mutex executors_mutex_;
-
-  // Cache of created StreamExecutors.
-  ExecutorCache executor_cache_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform);
-};
-
-}  // namespace executorplugin
-}  // namespace gputools
-}  // namespace perftools
-
-#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/compiler/plugin/executor/platform_id.h b/tensorflow/compiler/plugin/executor/platform_id.h
deleted file mode 100644
index 8d2b29a3e4..0000000000
--- a/tensorflow/compiler/plugin/executor/platform_id.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
-#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
-
-#include "tensorflow/stream_executor/platform.h"
-
-namespace perftools {
-namespace gputools {
-namespace executorplugin {
-
-extern const Platform::Id kExecutorPlatformId;
-
-}  // namespace executorplugin
-}  // namespace gputools
-}  // namespace perftools
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.cc b/tensorflow/compiler/plugin/executor/transfer_manager.cc
deleted file mode 100644
index 51c5deeea5..0000000000
--- a/tensorflow/compiler/plugin/executor/transfer_manager.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/plugin/executor/transfer_manager.h"
-#include "tensorflow/compiler/plugin/executor/platform_id.h"
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace sep = ::perftools::gputools::executorplugin;
-
-namespace xla {
-namespace executorplugin {
-
-ExecutorTransferManager::ExecutorTransferManager() {}
-
-se::Platform::Id ExecutorTransferManager::PlatformId() const {
-  return se::executorplugin::kExecutorPlatformId;
-}
-
-Status ExecutorTransferManager::TransferLiteralFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
-  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
-
-  // Tuples are a special case and contain one or more shapes inside of them to
-  // an arbitrary nesting depth.
-  if (device_shape.element_type() == TUPLE) {
-    *literal->mutable_shape() = literal_shape;
-    TF_ASSIGN_OR_RETURN(
-        std::vector<se::DeviceMemoryBase> element_buffers,
-        ShallowCopyTupleFromDevice(executor, source, device_shape));
-    TF_RET_CHECK(element_buffers.size() ==
-                 ShapeUtil::TupleElementCount(device_shape));
-    for (int64 i = 0; i < element_buffers.size(); ++i) {
-      const Shape& element_device_shape = device_shape.tuple_shapes(i);
-      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
-      Literal* element_literal = literal->add_tuple_literals();
-      // Recursively call TransferFromDevice to copy over the data in the
-      // element array.
-      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
-          executor, element_buffers[i], element_device_shape,
-          element_literal_shape, element_literal));
-    }
-    return Status::OK();
-  }
-
-  *literal->mutable_shape() = device_shape;
-  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
-  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
-      executor, source, ShapeUtil::ByteSizeOf(device_shape),
-      literal->MutableInternalData()));
-  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
-    literal->Swap(
-        literal->Relayout(literal_shape.layout()).get());
-  }
-  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
-  return Status::OK();
-}
-
-StatusOr<std::vector<se::DeviceMemoryBase>>
-ExecutorTransferManager::ShallowCopyTupleFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
-
-  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
-                                      nullptr);
-  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
-                                                    element_pointers.data());
-  if (!copy_status.ok()) {
-    return AddStatus(
-        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
-               copy_status.error_message()),
-        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
-  }
-
-  // Create a DeviceMemoryBase from each void* pointer.
-  std::vector<se::DeviceMemoryBase> destination;
-  for (int i = 0; i < element_pointers.size(); ++i) {
-    if (element_pointers[i] == nullptr &&
-        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %d", i);
-    }
-    int64 buffer_size =
-        ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*));
-    destination.emplace_back(element_pointers[i], buffer_size);
-  }
-  return std::move(destination);
-}
-
-Status ExecutorTransferManager::TransferLiteralToDevice(
-    se::StreamExecutor* executor, const Literal& literal,
-    se::DeviceMemoryBase* destination) {
-  const Shape& shape = literal.shape();
-
-  if (ShapeUtil::IsTuple(literal.shape())) {
-    std::vector<void*> tuple_elements_on_device;
-    for (const Literal& tuple_element : literal.tuple_literals()) {
-      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
-          GetByteSizeRequirement(tuple_element.shape()));
-      TF_RETURN_IF_ERROR(
-          TransferLiteralToDevice(executor, tuple_element, &allocation));
-      tuple_elements_on_device.push_back(allocation.opaque());
-    }
-    return TransferBufferToDevice(
-        executor, tuple_elements_on_device.size() * sizeof(void*),
-        tuple_elements_on_device.data(), destination);
-  }
-
-  return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
-                                literal.InternalData(),
-                                destination);
-}
-
-Status ExecutorTransferManager::TransferLiteralToInfeed(
-    se::StreamExecutor* executor, const Literal& literal) {
-  const Shape& shape = literal.shape();
-  VLOG(1) << "transferring literal shape to infeed: "
-          << ShapeUtil::HumanString(shape);
-
-  return Status::OK();
-}
-
-Status ExecutorTransferManager::TransferBufferToInfeed(
-    se::StreamExecutor* executor, int64 size, const void* source) {
-  return Unimplemented("Transfer to Infeed");
-}
-
-Status ExecutorTransferManager::TransferLiteralFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-    Literal* literal) {
-  const Shape& shape = literal->shape();
-  VLOG(1) << "transferring literal shape from outfeed: "
-          << ShapeUtil::HumanString(shape);
-
-  return Status::OK();
-}
-
-Status ExecutorTransferManager::ResetDevices(
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        executors) {
-  return Unimplemented("Device reset not supported");
-}
-
-int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) {
-  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-}
-
-}  // namespace executorplugin
-}  // namespace xla
-
-static std::unique_ptr<xla::TransferManager> CreateExecutorTransferManager() {
-  return xla::MakeUnique<xla::executorplugin::ExecutorTransferManager>();
-}
-
-static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId,
-                                                &CreateExecutorTransferManager);
-  return true;
-}
-static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.h b/tensorflow/compiler/plugin/executor/transfer_manager.h
deleted file mode 100644
index 7a42e5a2d7..0000000000
--- a/tensorflow/compiler/plugin/executor/transfer_manager.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
-#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
-
-#include "tensorflow/compiler/xla/service/transfer_manager.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-#include "tensorflow/core/platform/types.h"
-
-#include <vector>
-
-namespace se = ::perftools::gputools;
-
-namespace xla {
-namespace executorplugin {
-
-class ExecutorTransferManager : public TransferManager {
- public:
-  ExecutorTransferManager();
-
-  ~ExecutorTransferManager() override {}
-
-  se::Platform::Id PlatformId() const override;
-
-  StatusOr<std::vector<se::DeviceMemoryBase>> ShallowCopyTupleFromDevice(
-      se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-      const Shape& shape) override;
-
-  Status TransferLiteralFromDevice(se::StreamExecutor* executor,
-                                   const se::DeviceMemoryBase& source,
-                                   const Shape& device_shape,
-                                   const Shape& literal_shape,
-                                   Literal* literal) override;
-
-  Status TransferLiteralToDevice(se::StreamExecutor* executor,
-                                 const Literal& literal,
-                                 se::DeviceMemoryBase* destination) override;
-
-  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                 const Literal& literal) override;
-
-  Status TransferBufferToInfeed(se::StreamExecutor* executor,
-                                 int64 size, const void* source) override;
-
-  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
-                                    const Shape& literal_shape,
-                                    Literal* literal) override;
-
-  Status ResetDevices(
-      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
-
-  int64 GetByteSizeRequirement(const Shape& shape) override;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager);
-};
-
-}  // namespace executorplugin
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index a75a5cd2cf..6b328fb618 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -218,7 +218,7 @@ class FtrlOptimizerTest(XLATestCase):
         self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval())
         self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval())
 
-  # When variables are initialized with Zero, FTRL-Proximal has two properties:
+  # When variables are intialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
   # with GradientDescent.
   # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is idential
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 16b778bca4..f752fb3ae2 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -94,14 +94,12 @@ class BatchMatMulOp : public XlaOpKernel {
       // Slice off individual matrices and reshape to 2D tensors.
       auto x_slice = builder->Slice(
           x_flat, {i, 0, 0},
-          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)},
-          {1, 1, 1});
+          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
       x_slice = builder->Reshape(
           x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
       auto y_slice = builder->Slice(
           y_flat, {i, 0, 0},
-          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)},
-          {1, 1, 1});
+          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
       y_slice = builder->Reshape(
           y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
 
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 21d3e64872..47d2d747e6 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -125,7 +125,6 @@ void BatchToSpace(XlaOpKernelContext* ctx,
   //       input_shape[M+1], ..., input_shape[N-1]]
   std::vector<int64> start_indices(input_rank, 0);
   std::vector<int64> end_indices = reshaped_permuted_shape;
-  std::vector<int64> strides(input_rank, 1);
   for (int i = 0; i < block_rank; ++i) {
     int64 crop_start = crops.Get<int64>({i, 0});
     int64 crop_end = crops.Get<int64>({i, 1});
@@ -140,7 +139,7 @@ void BatchToSpace(XlaOpKernelContext* ctx,
             " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i]));
   }
   xla::ComputationDataHandle output =
-      b->Slice(reshaped_permuted, start_indices, end_indices, strides);
+      b->Slice(reshaped_permuted, start_indices, end_indices);
   ctx->SetOutput(0, output);
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
index 852d2a966e..92b371cc4e 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
@@ -172,14 +172,15 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
     } else {
       // These will be used to define the bounds of each slice.
       // Within the loop, the input_channel index will be modified.
-      gtl::InlinedVector<int64, 4> filter_begin(4, 0);
-      gtl::InlinedVector<int64, 4> filter_limits(4);
-      gtl::InlinedVector<int64, 4> input_begin(4, 0);
-      gtl::InlinedVector<int64, 4> input_limits(4);
-      gtl::InlinedVector<int64, 4> strides(4, 1);
+      gtl::InlinedVector<int64, 4> filter_begin;
+      gtl::InlinedVector<int64, 4> filter_limits;
+      gtl::InlinedVector<int64, 4> input_begin;
+      gtl::InlinedVector<int64, 4> input_limits;
       for (int i = 0; i < 4; ++i) {
-        filter_limits[i] = filter_shape.dim_size(i);
-        input_limits[i] = input_shape.dim_size(i);
+        filter_begin.push_back(0);
+        filter_limits.push_back(filter_shape.dim_size(i));
+        input_begin.push_back(0);
+        input_limits.push_back(input_shape.dim_size(i));
       }
 
       std::vector<int64> strides_for_tla{strides_[1], strides_[2]};
@@ -208,9 +209,9 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
         input_limits[3] = i + 1;
 
         xla::ComputationDataHandle filter_slice =
-            b.Slice(filter, filter_begin, filter_limits, strides);
+            b.Slice(filter, filter_begin, filter_limits);
         xla::ComputationDataHandle input_slice =
-            b.Slice(input, input_begin, input_limits, strides);
+            b.Slice(input, input_begin, input_limits);
         convs.push_back(b.ConvWithGeneralDimensions(
             input_slice, filter_slice, strides_for_tla, xla_padding, dims));
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index ec5017f6ab..74994d8961 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -125,7 +125,7 @@ class DiagPartOp : public XlaOpKernel {
     diag = builder->Reshape(diag, {new_size, new_size + 1});
 
     // Slices out the first column and reshapes to the final shape.
-    diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1});
+    diag = builder->Slice(diag, {0, 0}, {new_size, 1});
     diag = builder->Reshape(diag, new_dims);
 
     ctx->SetOutput(0, diag);
@@ -224,9 +224,8 @@ class MatrixDiagPartOp : public XlaOpKernel {
     } else if (actual_size > target_size) {
       std::vector<int64> start(flattened_dims.size(), 0);
       std::vector<int64> limits(flattened_dims.begin(), flattened_dims.end());
-      std::vector<int64> strides(flattened_dims.size(), 1);
       limits[flattened_dims.size() - 1] = target_size;
-      diag = builder->Slice(diag, start, limits, strides);
+      diag = builder->Slice(diag, start, limits);
     }
 
     // Reshape so the target values are in the first position of the last
@@ -239,9 +238,8 @@ class MatrixDiagPartOp : public XlaOpKernel {
     // Slices out the first column and reshapes to the final shape.
     std::vector<int64> start(dims.size(), 0);
     std::vector<int64> limits(dims.begin(), dims.end());
-    std::vector<int64> strides(dims.size(), 1);
     limits[last_dim] = 1;
-    diag = builder->Slice(diag, start, limits, strides);
+    diag = builder->Slice(diag, start, limits);
 
     // Collapses away the last dimension.
     dims.pop_back();
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 0330e34c98..faa7ef0ef9 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -156,8 +156,6 @@ class DynamicStitchOp : public XlaOpKernel {
                                    indices0_shape.dims());
     std::vector<int64> slice_limit(1 + data0_shape.dims() -
                                    indices0_shape.dims());
-    std::vector<int64> stride(1 + data0_shape.dims() -
-                              indices0_shape.dims(), 1);
     for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
       slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
     }
@@ -170,7 +168,7 @@ class DynamicStitchOp : public XlaOpKernel {
       // And place it in the concat list in the place indicated by
       // the index.
       to_concat[index_num] =
-          ctx->builder()->Slice(expression, slice_start, slice_limit, stride);
+          ctx->builder()->Slice(expression, slice_start, slice_limit);
     }
 
     ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0));
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 482c54a40c..51c97d85d7 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -54,9 +54,7 @@ class SliceOp : public XlaOpKernel {
     for (int i = 0; i < begin.size(); ++i) {
       limits.push_back(begin[i] + size[i]);
     }
-    std::vector<int64> strides(begin.size(), 1);
-    ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits,
-                                            strides));
+    ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 44ee81461e..017f3a110e 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -77,14 +77,14 @@ class SplitOp : public XlaOpKernel {
 
     // The vectors we will use to define the slice. The entry for the
     // split dimensions varies for each output.
-    std::vector<int64> begin(input_shape.dims(), 0);
-    std::vector<int64> limits(input_shape.dims());
-    std::vector<int64> strides(input_shape.dims(), 1);
+    std::vector<int64> begin;
+    std::vector<int64> limits;
     for (int i = 0; i < input_shape.dims(); ++i) {
       // Initially set up the limits to be the full size of the input:
       // the split dimension is filled in below.
       int64 dim = input_shape.dim_size(i);
-      limits[i] = dim;
+      begin.push_back(0);
+      limits.push_back(dim);
     }
 
     auto input = ctx->Input(1);
@@ -94,7 +94,7 @@ class SplitOp : public XlaOpKernel {
       // Slice out the ith split from the split dimension.
       begin[split_dim] = i * slice_size;
       limits[split_dim] = (i + 1) * slice_size;
-      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
+      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
     }
   }
 };
@@ -188,7 +188,7 @@ class SplitVOp : public XlaOpKernel {
     std::vector<int64> begin(input_shape.dims(), 0);
     auto dim_sizes = input_shape.dim_sizes();
     std::vector<int64> limits(dim_sizes.begin(), dim_sizes.end());
-    std::vector<int64> strides(input_shape.dims(), 1);
+
     for (int i = 0; i < num_split; ++i) {
       TensorShape output_shape(input_shape);
       int slice_size = split_sizes_vec[i];
@@ -196,7 +196,7 @@ class SplitVOp : public XlaOpKernel {
 
       // Slice out the ith split from the split dimension.
       limits[split_dim] = begin[split_dim] + slice_size;
-      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
+      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
       begin[split_dim] = limits[split_dim];
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 6af4bd0496..8037e90791 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -72,29 +72,55 @@ class StridedSliceOp : public XlaOpKernel {
                        &dummy, &dummy, &dummy, &begin, &end, &strides));
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
-    gtl::InlinedVector<int64, 4> slice_begin, slice_end, slice_strides;
-
+    gtl::InlinedVector<int64, 4> slice_begin, slice_end;
+    bool simple_strides = true;
     for (int i = 0; i < begin.size(); ++i) {
+      simple_strides &= (std::abs(strides[i]) == 1);
       if (strides[i] > 0) {
         slice_begin.push_back(begin[i]);
         slice_end.push_back(end[i]);
-        slice_strides.push_back(strides[i]);
       } else {
         // Negative stride: swap begin and end, add 1 because the interval
         // is semi-open, and mark the dimension to be reversed.
-        slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1);
-        slice_end.push_back(input_shape.dim_size(i) - end[i] - 1);
-        slice_strides.push_back(-strides[i]);
+        slice_begin.push_back(end[i] + 1);
+        slice_end.push_back(begin[i] + 1);
         dimensions_to_reverse.push_back(i);
       }
     }
-
-    xla::ComputationDataHandle slice = ctx->Input(0);
+    xla::ComputationDataHandle slice =
+        ctx->builder()->Slice(ctx->Input(0), slice_begin, slice_end);
     if (!dimensions_to_reverse.empty()) {
       slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
     }
 
-    slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides);
+    // If at least one of the strides is > 1 (or < -1) then use Slice
+    // to pull out each of the strided slices, and Concat to put them
+    // together again.
+    if (!simple_strides) {
+      // Re-adjust the begin and end now that the periphery has been
+      // sliced away.
+      for (int d = 0; d < strides.size(); ++d) {
+        slice_end[d] -= slice_begin[d];
+        slice_begin[d] = 0;
+      }
+
+      for (int d = 0; d < strides.size(); ++d) {
+        int64 stride = std::abs(strides[d]);
+        if (stride > 1) {
+          std::vector<xla::ComputationDataHandle> to_concat;
+          int64 end = slice_end[d];
+          for (int64 i = 0; i < end; i += stride) {
+            slice_begin[d] = i;
+            slice_end[d] = i + 1;
+            to_concat.push_back(
+                ctx->builder()->Slice(slice, slice_begin, slice_end));
+          }
+          slice = ctx->builder()->ConcatInDim(to_concat, d);
+          slice_begin[d] = 0;
+          slice_end[d] = to_concat.size();
+        }
+      }
+    }
 
     slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
     ctx->SetOutput(0, slice);
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 9367c1ef22..598b341002 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -318,7 +318,7 @@ class TensorArrayGatherOp : public XlaOpKernel {
     for (int i = 0; i < num_indices; ++i) {
       // Slices the i-th index out of `indices`, and pads it with zeros in the
       // minor dimensions to form an index into the TensorArray storage.
-      auto index = b->Slice(indices, {i}, {i + 1}, {1});
+      auto index = b->Slice(indices, {i}, {i + 1});
 
       // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
       auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1);
@@ -381,18 +381,16 @@ class TensorArrayScatterOp : public XlaOpKernel {
     std::vector<int64> value_starts(value_shape.dims(), 0);
     auto value_ends = value_shape.dim_sizes();
 
-    std::vector<int64> value_strides(value_shape.dims(), 1);
-
     // For every (index, value) pair, update the corresponding TensorArray
     // storage.
     for (int i = 0; i < num_indices; ++i) {
       // Slice out part of the value.
       value_starts[0] = i;
       value_ends[0] = i + 1;
-      auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+      auto slice = b->Slice(value, value_starts, value_ends);
 
       // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-      auto index = b->Slice(indices, {i}, {i + 1}, {1});
+      auto index = b->Slice(indices, {i}, {i + 1});
       auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims());
       ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index f87586ba57..a5ce78e520 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -66,7 +66,6 @@ class UnpackOp : public XlaOpKernel {
 
     std::vector<int64> start_indices(input_shape.dims(), 0);
     std::vector<int64> limit_indices(input_shape.dims());
-    std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < input_shape.dims(); ++i) {
       limit_indices[i] = input_shape.dim_size(i);
     }
@@ -74,8 +73,7 @@ class UnpackOp : public XlaOpKernel {
     for (int i = 0; i < num; ++i) {
       start_indices[axis] = i;
       limit_indices[axis] = i + 1;
-      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices,
-                                         strides);
+      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices);
       // Reshape to drop the 'axis' dimension.
       auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes());
       ctx->SetOutput(i, result);
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index dcc313707b..735a69d596 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -256,8 +256,7 @@ void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
 ComputationDataHandle ComputationBuilder::Slice(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
-    tensorflow::gtl::ArraySlice<int64> limit_indices,
-    tensorflow::gtl::ArraySlice<int64> stride) {
+    tensorflow::gtl::ArraySlice<int64> limit_indices) {
   if (!first_error_.ok() || !PrepareComputation().ok()) {
     return ComputationDataHandle();
   }
@@ -270,9 +269,6 @@ ComputationDataHandle ComputationBuilder::Slice(
   for (int64 index : limit_indices) {
     request.add_limit_indices(index);
   }
-  for (int64 index : stride) {
-    request.add_stride(index);
-  }
   OpRequest op_request;
   *op_request.mutable_computation() = computation_.handle();
   *op_request.mutable_slice_request() = request;
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index b411346459..5dceb03281 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -211,11 +211,9 @@ class ComputationBuilder {
   //
   // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
   // range notation.
-  // The stride parameter determines the stride over the slice
   ComputationDataHandle Slice(const ComputationDataHandle& operand,
                               tensorflow::gtl::ArraySlice<int64> start_indices,
-                              tensorflow::gtl::ArraySlice<int64> limit_indices,
-                              tensorflow::gtl::ArraySlice<int64> stride);
+                              tensorflow::gtl::ArraySlice<int64> limit_indices);
 
   // Enqueues a slice operation onto the computation that slices the 'operand'
   // from dynamic start indices which are passed in 'start_indices'.
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index b6bd1158d2..1b125e3596 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -1205,7 +1205,11 @@ void Literal::Resize<double>(int64 num_elements, double value) {
 template <>
 void Literal::Resize<half>(int64 num_elements, half value) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_f16s()->resize(num_elements, value);
+  mutable_f16s()->resize(num_elements * sizeof(half));
+  auto data = GetMutableArraySlice<half>();
+  for (int i = 0; i < num_elements; i++) {
+    data[i] = value;
+  }
 }
 
 template <typename RepeatedFieldT, typename NativeT>
@@ -1248,7 +1252,7 @@ LiteralProto Literal::ToProto() const {
     case F16:
       *proto.mutable_f16s() =
           string(reinterpret_cast<const char*>(f16s_.data()),
-                 f16s_.size() * sizeof(half));
+                 f16s_.size() / sizeof(half));
       break;
     case F32:
       CopyToRepeatedField(proto.mutable_f32s(), f32s());
@@ -1304,7 +1308,7 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
       const string& s(literal_proto.f16s());
       CHECK_EQ(0, s.size() % sizeof(half));
       f16s_ = std::vector<half>(s.size() / sizeof(half));
-      memcpy(f16s_.data(), s.data(), s.size());
+      memcpy(f16s_.data(), s.data(), s.size() / sizeof(half));
       break;
     }
     case F32:
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 5a550ef4c6..ffae623b0c 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -939,62 +939,5 @@ TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   }
 }
 
-// Note that f16 is currently stored in a byte array in little endian byte order
-TEST_F(LiteralUtilTest, ToProto_f16) {
-  half h1(1.0f);
-  half h2(2.0f);
-
-  auto m = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
-  Literal* l = m.get();
-  EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape()));
-  EXPECT_EQ(4, l->f16s().size());
-  EXPECT_EQ(4, l->f16s_size());
-
-  LiteralProto p = l->ToProto();
-  EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
-  EXPECT_EQ(8, p.f16s().size());
-  const char* d = p.f16s().data();
-  EXPECT_EQ(d[0], 0);
-  EXPECT_EQ(d[1], 0x3C);
-  EXPECT_EQ(d[2], 0);
-  EXPECT_EQ(d[3], 0x40);
-  EXPECT_EQ(d[4], 0);
-  EXPECT_EQ(d[5], 0x40);
-  EXPECT_EQ(d[6], 0);
-  EXPECT_EQ(d[7], 0x3C);
-}
-
-// Note that f16 is currently stored in a byte array in little endian byte order
-TEST_F(LiteralUtilTest, CopyFromProto_f16) {
-  half h1(1.0f);
-  half h2(2.0f);
-
-  const char half_vals[8] = {
-    0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C
-  };
-  LiteralProto p;
-  p.mutable_shape()->set_element_type(F16);
-  p.mutable_shape()->clear_dimensions();
-  p.mutable_shape()->add_dimensions(4);
-  p.clear_f16s();
-  p.set_f16s(half_vals, 8);
-
-
-  Literal literal(p);
-  ASSERT_EQ(4, literal.f16s_size());
-  ASSERT_EQ(h1, literal.f16s(0));
-  ASSERT_EQ(h2, literal.f16s(1));
-  ASSERT_EQ(h2, literal.f16s(2));
-  ASSERT_EQ(h1, literal.f16s(3));
-
-  const std::vector<half>& r = literal.f16s();
-  ASSERT_EQ(4, r.size());
-  ASSERT_EQ(h1, r[0]);
-  ASSERT_EQ(h2, r[1]);
-  ASSERT_EQ(h2, r[2]);
-  ASSERT_EQ(h1, r[3]);
-}
-
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 99b1337b11..718a2d798c 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -90,6 +90,8 @@ cc_library(
         ":hlo_query",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 5709ac3067..0187c09d7b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -855,7 +855,6 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
     // Second, construct the slice instruction to perform the negative padding.
     std::vector<int64> start_indices;
     std::vector<int64> end_indices;
-    std::vector<int64> strides;
     for (int64 i = 0; i < pad->padding_config().dimensions_size(); ++i) {
       const PaddingConfig::PaddingConfigDimension& padding_dimension =
           pad->padding_config().dimensions(i);
@@ -869,18 +868,16 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
       }
       start_indices.push_back(start);
       end_indices.push_back(end);
-      strides.push_back(1);
     }
 
     // Verify that the slice shape matches the pad shape.
     TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape,
                         ShapeInference::InferSliceShape(
-                            nonzero_pad_shape, start_indices, end_indices,
-                            strides));
+                            nonzero_pad_shape, start_indices, end_indices));
     TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape()));
 
     std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
-        pad->shape(), nonzero_pad, start_indices, end_indices, strides);
+        pad->shape(), nonzero_pad, start_indices, end_indices);
     return ReplaceWithNewInstruction(pad, std::move(slice));
   }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 7e52c8fb0c..0792006ddb 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -520,7 +520,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
-          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1}));
+          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}));
   Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
@@ -551,7 +551,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
-          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1}));
+          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}));
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
@@ -1132,7 +1132,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
           0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
   builder.AddInstruction(HloInstruction::CreateSlice(
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
-      /*limit_indices=*/{dim0, dim1}, /*slices=*/{1, 1}));
+      /*limit_indices=*/{dim0, dim1}));
 
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
@@ -1537,7 +1537,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
 
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
-      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
+      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 56568fd446..c498b86dd4 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -731,7 +731,7 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
 
@@ -763,7 +763,7 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast}));
@@ -800,7 +800,7 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) {
   auto tuple_element = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(f32vec100_, tuple, 0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast}));
@@ -835,7 +835,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   // Slice output is 10 elements.
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
@@ -867,7 +867,7 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 10}), slice, {0}));
@@ -904,7 +904,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   // Slice output is 10 elements.
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index a5f7cc0aeb..a31e9b1782 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -588,7 +588,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     if (update_uses_tuple_element1) {
       // Create a slice instruction as an additional user of 'gte1'.
       slice = builder.AddInstruction(
-          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}, {1}));
+          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}));
       update = builder.AddInstruction(HloInstruction::CreateBinary(
           update_shape, HloOpcode::kAdd, update, slice));
     }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index 0a1911cbd1..dd00c58240 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -55,7 +55,7 @@ class CompileOnlyService : public Service {
 
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
-  // computing constants produces global data that we may wish to transfer.
+  // computing constants produces global data that we may wish to transfer.
   tensorflow::Status Execute(const ExecuteRequest* arg,
                              ExecuteResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index cdfa30dd9a..cdf277581f 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -49,18 +49,17 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
   return Status::OK();
 }
 
-/* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
-DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
+/* static */ StatusOr<DeviceAssignment> DeviceAssignment::Deserialize(
+    const DeviceAssignmentProto& proto) {
   TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
-  auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
-                                                 proto.computation_count());
+  DeviceAssignment assignment(proto.replica_count(), proto.computation_count());
   for (int computation = 0; computation < proto.computation_count();
        ++computation) {
     const auto& computation_device = proto.computation_devices(computation);
     TF_RET_CHECK(computation_device.replica_device_ids_size() ==
                  proto.replica_count());
     for (int replica = 0; replica < proto.replica_count(); ++replica) {
-      (*assignment)(replica, computation) =
+      assignment(replica, computation) =
           computation_device.replica_device_ids(replica);
     }
   }
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 7d9abcd100..4d26d6bb85 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -49,11 +49,7 @@ class DeviceAssignment : public Array2D<int> {
 
   // Protocol buffer serialization and deserialization.
   Status Serialize(DeviceAssignmentProto* proto) const;
-
-  // Return a std::unique_ptr<DeviceAssignment> instead of a DeviceAssignment
-  // directly because one of the supported TF platforms (mac) does not compile
-  // due to a StatusOr of an incomplete type (DeviceAssignment).
-  static StatusOr<std::unique_ptr<DeviceAssignment>> Deserialize(
+  static StatusOr<DeviceAssignment> Deserialize(
       const DeviceAssignmentProto& proto);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 759d27e1f3..da8d983e1a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -359,6 +359,7 @@ Status AppendIRToFile(const string& file_name, const string& ir_module_string) {
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     std::unique_ptr<HloModule> module, HloDumper dump_hlo,
     se::StreamExecutor* stream_exec) {
+  VLOG(1) << "Compiling: " << module->name();
   TF_RET_CHECK(stream_exec != nullptr);
   std::call_once(llvm_command_line_options_initialized,
                  &InitializeLLVMCommandLineOptions, module->config());
@@ -403,6 +404,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       module->config().debug_options().xla_dump_debug_json_to();
 
   if (CpuParallelBackendRequested(module->config())) {
+    VLOG(1) << "Using parallel cpu backend";
+
     // Run buffer analysis on the HLO graph. This analysis figures out which
     // temporary buffers are required to run the computation.
     // DependencyHloOrdering is used for the parallel emitter because the order
@@ -497,6 +500,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
           .set_ir_module_string(ir_module_string);
     }
   } else {
+    VLOG(1) << "Using sequential cpu backend";
+
     // Select an order for emitting the HLO instructions for each
     // computation. Using this sequence enables tighter buffer liveness analysis
     // and reduced memory usage (as compared to using DependencyHloOrdering).
@@ -562,6 +567,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     }
   }
 
+  VLOG(1) << "Compilation finished";
   return std::move(cpu_executable);
 }
 
@@ -663,6 +669,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
   std::vector<std::unique_ptr<AotCompilationResult>> results;
   for (size_t i = 0; i < modules.size(); ++i) {
     HloModule* module = modules[i].get();
+    VLOG(1) << "Compiling ahead-of-time: " << module->name();
 
     TF_RETURN_IF_ERROR(RunHloPasses(module, dump_hlo));
 
@@ -741,6 +748,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         std::move(object_file_data), std::move(buffer_sizes),
         result_slice.index()));
   }
+
+  VLOG(1) << "Compilation finished";
   return std::move(results);
 }
 
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index db0a8b36cd..5b21ae3d2a 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -949,20 +949,9 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         IrArray::Index sliced_index(index.size());
         for (int i = 0; i < index.size(); ++i) {
-          int64 stride = hlo->slice_stride(i);
-          if (stride != 1) {
-            sliced_index[i] = ir_builder_->CreateAdd(
-                ir_builder_->CreateMul(
-                    index[i], llvm::ConstantInt::get(index[i]->getType(),
-                                                     stride)),
-                llvm::ConstantInt::get(index[i]->getType(),
-                                       hlo->slice_starts(i)));
-          } else {
-            sliced_index[i] = ir_builder_->CreateAdd(
-                    index[i],
-                    llvm::ConstantInt::get(index[i]->getType(),
-                                           hlo->slice_starts(i)));
-          }
+          sliced_index[i] = ir_builder_->CreateAdd(
+              index[i], llvm::ConstantInt::get(index[i]->getType(),
+                                               hlo->slice_starts(i)));
         }
         return operand_to_generator.at(hlo->operand(0))(sliced_index);
       };
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index b8c6162084..4e130de311 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -80,7 +80,6 @@ HloInstruction* MaybePaddedAndSlicedInput(
     std::vector<int64> start_indices(input->shape().dimensions_size(), 0);
     std::vector<int64> limit_indices(input->shape().dimensions().begin(),
                                      input->shape().dimensions().end());
-    std::vector<int64> strides(input->shape().dimensions_size(), 1);
     for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
       int64 dim = conv_dnums.spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
@@ -93,9 +92,9 @@ HloInstruction* MaybePaddedAndSlicedInput(
 
     input = computation->AddInstruction(HloInstruction::CreateSlice(
         ShapeInference::InferSliceShape(input->shape(), start_indices,
-                                        limit_indices, strides)
+                                        limit_indices)
             .ConsumeValueOrDie(),
-        input, start_indices, limit_indices, strides));
+        input, start_indices, limit_indices));
   }
 
   return input;
@@ -355,8 +354,6 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   std::vector<int64> limit_indices(
       new_backward_conv->shape().dimensions().begin(),
       new_backward_conv->shape().dimensions().end());
-  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(),
-                             1LL);
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
@@ -376,13 +373,13 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   // Replace the old backward convolution with the slice.
   CHECK(ShapeUtil::Compatible(
       ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
-                                      limit_indices, strides)
+                                      limit_indices)
           .ConsumeValueOrDie(),
       backward_conv->shape()));
   TF_CHECK_OK(computation->ReplaceWithNewInstruction(
       backward_conv,
       HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv,
-                                  start_indices, limit_indices, strides)));
+                                  start_indices, limit_indices)));
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 1c60b06ddd..a643bc4076 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -147,7 +147,6 @@ TEST_F(HloConstantFoldingTest, Slice) {
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   const int64 slice_start[] = {4, 2, 3, 1, 5};
   const int64 slice_limits[] = {10, 8, 6, 5, 9};
-  const int64 slice_strides[] = {1, 1, 1, 1, 1};
   TF_ASSIGN_OR_ASSERT_OK(auto literal,
                          LiteralTestUtil::CreateRandomLiteral<F32>(
                              ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
@@ -155,7 +154,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
       HloInstruction::CreateConstant(std::move(literal)));
   Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
   builder.AddInstruction(HloInstruction::CreateSlice(
-      shape, literal_instruction, slice_start, slice_limits, slice_strides));
+      shape, literal_instruction, slice_start, slice_limits));
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 9117ab9653..99b73dea29 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -306,13 +306,11 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
-    tensorflow::gtl::ArraySlice<int64> limit_indices,
-    tensorflow::gtl::ArraySlice<int64> strides) {
+    tensorflow::gtl::ArraySlice<int64> limit_indices) {
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape));
   instruction->AppendOperand(operand);
   instruction->slice_starts_.assign(start_indices.begin(), start_indices.end());
   instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end());
-  instruction->slice_strides_.assign(strides.begin(), strides.end());
   return instruction;
 }
 
@@ -854,8 +852,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       return CreateReshape(shape, new_operands[0]);
     case HloOpcode::kSlice:
       CHECK_EQ(new_operands.size(), 1);
-      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_,
-                         slice_strides_);
+      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_);
     case HloOpcode::kDynamicSlice:
       return CreateDynamicSlice(shape, new_operands[0], new_operands[1],
                                 dynamic_slice_sizes_);
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index d29c0935fc..37cbb0b769 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -174,8 +174,7 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateSlice(
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> start_indices,
-      tensorflow::gtl::ArraySlice<int64> limit_indices,
-      tensorflow::gtl::ArraySlice<int64> strides);
+      tensorflow::gtl::ArraySlice<int64> limit_indices);
 
   // Creates a slice instruction, where the first operand is sliced by
   // start indices specified in the second operand, and by size specfied in
@@ -663,15 +662,6 @@ class HloInstruction {
     return slice_limits_;
   }
 
-  // Returns the stride in the given dimension for a slice node.
-  //
-  // Precondition: opcode() == HloOpcode::kSlice
-  int64 slice_stride(int64 dimension) const {
-    CHECK_EQ(HloOpcode::kSlice, opcode_);
-    return slice_strides_[dimension];
-  }
-  const std::vector<int64>& slice_strides() const { return slice_strides_; }
-
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
   //
@@ -917,7 +907,6 @@ class HloInstruction {
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
-  std::vector<int64> slice_strides_;
 
   // The bit sizes for a reduce-precision operation.
   int32 exponent_bits_;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 1a861cd16b..8a1e705711 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -67,8 +67,7 @@ class HloRematerializationTest : public HloTestBase {
         /*dimension=*/0));
     auto slice_1 = builder.AddInstruction(HloInstruction::CreateSlice(
         vec1_shape_, concat_1, /*start_indices=*/{0},
-        /*limit_indices=*/{1},
-        /*strides=*/{1}));
+        /*limit_indices=*/{1}));
     auto concat_2 = builder.AddInstruction(HloInstruction::CreateConcatenate(
         ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, slice_1},
         /*dimension=*/0));
@@ -76,8 +75,7 @@ class HloRematerializationTest : public HloTestBase {
     // which is necessary to use this computation in a while.
     builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat_2,
                                                        /*start_indices=*/{0},
-                                                       /*limit_indices=*/{1},
-                                                       /*strides=*/{1}));
+                                                       /*limit_indices=*/{1}));
     return builder.Build();
   }
 
@@ -105,8 +103,7 @@ class HloRematerializationTest : public HloTestBase {
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto slice_1 = builder.AddInstruction(
         HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0},
-                                    /*limit_indices=*/{1},
-                                    /*strides=*/{1}));
+                                    /*limit_indices=*/{1}));
     auto while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
         vec1_shape_, while_cond, while_body, slice_1));
     auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
@@ -114,8 +111,7 @@ class HloRematerializationTest : public HloTestBase {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat,
                                                        /*start_indices=*/{0},
-                                                       /*limit_indices=*/{1},
-                                                       /*strides=*/{1}));
+                                                       /*limit_indices=*/{1}));
     return builder.Build();
   }
 
@@ -357,7 +353,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}, /*slices=*/{1}));
+        /*limit_indices=*/{1024}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
@@ -473,7 +469,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}, /*slices=*/{1}));
+        /*limit_indices=*/{1024}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index bcc9418d59..e348511c62 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -356,26 +356,9 @@ void EmitLogging(const char* tag, llvm::Value* value,
 
 void SetTbaaForInstruction(llvm::Instruction* instruction, Shape shape,
                            bool is_pointer_to) {
-  llvm::MDBuilder metadata_builder(instruction->getContext());
-  llvm::MDNode* root = metadata_builder.createTBAARoot("XLA TBAA");
-  string type_name;
-  if (is_pointer_to) {
-    type_name += "pointer-to ";
-  }
-  // Scalars do not have layout which makes it permissible to omit an explicit
-  // layout.  To make sure that equivalent scalar shapes have the same TBAA,
-  // remove the (meaningless) explicit layout if one is present.
-  if (!ShapeUtil::IsArray(shape) || ShapeUtil::IsScalar(shape)) {
-    LayoutUtil::ClearLayout(&shape);
-  } else {
-    CHECK(shape.has_layout());
-  }
-  type_name += shape.ShortDebugString();
-  llvm::MDNode* tbaa_node =
-      metadata_builder.createTBAANode(llvm_ir::AsStringRef(type_name), root);
-  instruction->setMetadata(llvm::LLVMContext::MD_tbaa,
-                           metadata_builder.createTBAAStructTagNode(
-                               tbaa_node, tbaa_node, /*Offset=*/0));
+  // TODO(b/62903316): TBAA metadata causes LLVM to miscompile generated code,
+  // most likely because the generated metadata is incorrect.  Disable TBAA
+  // metadata while we resolve this.
 }
 
 void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 5e4df9ddd6..b332709995 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1135,8 +1135,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
 /* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
-    tensorflow::gtl::ArraySlice<int64> limits,
-    tensorflow::gtl::ArraySlice<int64> strides) {
+    tensorflow::gtl::ArraySlice<int64> limits) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
@@ -1159,13 +1158,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dimension = 0; dimension < starts.size(); ++dimension) {
     int64 start_index = starts[dimension];
     int64 limit_index = limits[dimension];
-    int64 stride = strides[dimension];
     if (start_index < 0) {
       return InvalidArgument("negative start index to slice: %lld",
                              start_index);
     }
-    if (stride == 0) {
-      return InvalidArgument("Zero stride");
+    if (limit_index < 0) {
+      return InvalidArgument("negative limit index to slice: %lld",
+                             limit_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
       return InvalidArgument(
@@ -1173,21 +1172,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           "size (%lld)",
           limit_index, arg.dimensions(dimension));
     }
+    if (start_index > limit_index) {
+      return InvalidArgument(
+          "limit index (%lld) must be greater or equal to "
+          "start index (%lld) in slice",
+          limit_index, start_index);
+    }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
-    if (stride > 0) {
-      if (start_index > limit_index) {
-        return InvalidArgument(
-            "limit index (%lld) must be greater or equal to "
-            "start index (%lld) in slice with positive stride",
-            limit_index, start_index);
-      }
-      sizes.push_back((limit_index - start_index + stride - 1) / stride);
-    } else {
-      return InvalidArgument("Negative strides not supported");
-    }
+
+    sizes.push_back(limits[dimension] - starts[dimension]);
   }
 
   return ShapeUtil::MakeShape(arg.element_type(), sizes);
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 42e4c7d39d..55c60e149d 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -116,8 +116,7 @@ class ShapeInference {
   // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16]
   static StatusOr<Shape> InferSliceShape(
       const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
-      tensorflow::gtl::ArraySlice<int64> limits,
-      tensorflow::gtl::ArraySlice<int64> strides);
+      tensorflow::gtl::ArraySlice<int64> limits);
 
   // Infers the shape produced by a dynamic slice operation of size specified
   // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 8c731ae297..7cff042a48 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -682,43 +682,16 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2) {
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
   auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1});
+      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64});
   ASSERT_IS_OK(inferred_status.status());
   Shape inferred = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), inferred));
 }
 
-TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4});
-  ASSERT_IS_OK(inferred_status.status());
-  Shape inferred = inferred_status.ValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), inferred));
-}
-
-TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4});
-  ASSERT_IS_OK(inferred_status.status());
-  Shape inferred = inferred_status.ValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), inferred));
-}
-
-TEST_F(ShapeInferenceTest, InferInvalidStride) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1});
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
-            inferred_status.status().code());
-}
-
 TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
   auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1});
+      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2});
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
             inferred_status.status().code());
@@ -727,7 +700,7 @@ TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
 TEST_F(ShapeInferenceTest, InferSliceShapeRank1) {
   Shape vector_shape = ShapeUtil::MakeShape(F32, {17});
   auto inferred_status =
-      ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1});
+      ShapeInference::InferSliceShape(vector_shape, {2}, {4});
   ASSERT_TRUE(inferred_status.ok());
   Shape inferred = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(inferred, ShapeUtil::MakeShape(F32, {2})));
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index cd79e63caf..d25e5adee3 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -584,7 +584,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
     if (add_additional_gte0_user) {
       // Create 'slice' as an additional user of 'input'.
       auto slice = builder.AddInstruction(
-          HloInstruction::CreateSlice(update_shape, input, {0}, {3}, {1}));
+          HloInstruction::CreateSlice(update_shape, input, {0}, {3}));
       // Modify 'update' to take 'slice' output.
       update = builder.AddInstruction(HloInstruction::CreateBinary(
           update_shape, HloOpcode::kAdd, update, slice));
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 92b8c7bb21..1f6e789379 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -744,8 +744,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddSliceInstruction(
       Shape new_shape,
       ShapeInference::InferSliceShape(
           operand->output_shape(), AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices()),
-          AsInt64Slice(slice_request.stride())));
+          AsInt64Slice(slice_request.limit_indices())));
 
   ComputationDataHandle handle = CreateComputationDataHandle();
 
@@ -2394,8 +2393,7 @@ void ComputationLowerer::Visit(
       hlo_instruction = add_instruction(HloInstruction::CreateSlice(
           request.output_shape(), operand,
           AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices()),
-          AsInt64Slice(slice_request.stride())));
+          AsInt64Slice(slice_request.limit_indices())));
       break;
     }
 
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 024988743c..bb7fbad000 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1853,7 +1853,7 @@ TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
 
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, y_literal->shape(), "y");
-  auto slice = builder.Slice(x, {1}, {2}, {1});
+  auto slice = builder.Slice(x, {1}, {2});
   builder.Sub(slice, y);
 
   ComputeAndCompareR1<float>(&builder, {-2, -3}, {x_data.get(), y_data.get()},
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 63a630f9e5..7abef6a27b 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -365,9 +365,9 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
   std::vector<xla::ComputationDataHandle> out_slices;
   for (int i = 0; i < 4; ++i) {
     // Slice off individual matrices and reshape to 2D tensors.
-    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
+    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2});
     x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2});
-    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
+    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2});
     y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2});
 
     auto out = builder.Dot(x_slice, y_slice);
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 7803d234fd..c8b91eafc7 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -210,7 +210,7 @@ XLA_TEST_F(FusionTest, Test) {
       HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}),
                                     HloOpcode::kSelect, const10, add8, const9));
   auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice(
-      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1}));
+      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}));
   // CreateFusionInstruction needs the `instructions_to_fuse` argument in
   // reverse topological order, so the first element in `instructions_to_fuse`
   // must be the root.
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index 56c15e5ff7..df3d4fa21d 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -36,7 +36,7 @@ XLA_TEST_F(SliceTest, Slice2D) {
   ComputationBuilder builder(client_, "slice_2d");
   auto original = builder.ConstantR2<float>(
       {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}});
-  builder.Slice(original, {2, 1}, {4, 3}, {1, 1});
+  builder.Slice(original, {2, 1}, {4, 3});
 
   Array2D<float> expected({{8.0f, 9.0f}, {11.0f, 12.0f}});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -47,7 +47,7 @@ XLA_TEST_F(SliceTest, Slice3D) {
   Array3D<float> array_3d(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}});
   auto original = builder.ConstantR3FromArray3D<float>(array_3d);
-  builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1});
+  builder.Slice(original, {0, 0, 1}, {2, 1, 2});
 
   Array3D<float> expected_3d({{{2.0f}}, {{6.0f}}});
   ComputeAndCompareR3<float>(&builder, expected_3d, {}, ErrorSpec(0.000001));
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index a7692fceb4..2065e9e813 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -325,7 +325,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
   ComputationBuilder builder(client_, TestName());
   auto input = builder.Parameter(0, original, "input");
   // Use the slice operator to get an off-diagonal element.
-  builder.Slice(input, {0, 1}, {1, 2}, {1, 1});
+  builder.Slice(input, {0, 1}, {1, 2});
 
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 5e7d475662..97120df0c5 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -44,7 +44,7 @@ class SliceTest : public ClientLibraryTestBase {
 
     ComputationBuilder builder(client_, TestName());
     auto original = builder.ConstantR1<NativeT>(constant);
-    builder.Slice(original, {2}, {4}, {1});
+    builder.Slice(original, {2}, {4});
 
     const std::vector<NativeT> expected = {static_cast<NativeT>(2),
                                            static_cast<NativeT>(3)};
@@ -55,7 +55,7 @@ class SliceTest : public ClientLibraryTestBase {
 XLA_TEST_F(SliceTest, SliceZeroToZeroF32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>({});
-  builder.Slice(original, {0}, {0}, {1});
+  builder.Slice(original, {0}, {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {});
 }
@@ -64,7 +64,7 @@ XLA_TEST_F(SliceTest, SliceTenToZeroF32) {
   ComputationBuilder builder(client_, TestName());
   std::vector<float> constant(10, 0.3);
   auto original = builder.ConstantR1<float>(constant);
-  builder.Slice(original, {7}, {7}, {1});
+  builder.Slice(original, {7}, {7});
 
   ComputeAndCompareR1<float>(&builder, {}, {});
 }
@@ -87,7 +87,7 @@ TEST_F(SliceTest, SliceTenToTen) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {0}, {10}, {1});
+  builder.Slice(original, {0}, {10});
 
   ComputeAndCompareR1<float>(&builder, values, {}, ErrorSpec(0.000001));
 }
@@ -98,7 +98,7 @@ TEST_F(SliceTest, SliceLastFourOf1024) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {1024 - 4}, {1024}, {1});
+  builder.Slice(original, {1024 - 4}, {1024});
 
   const std::vector<float> expected = {1020, 1021, 1022, 1023};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -112,7 +112,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {7}, {7 + 1024}, {1});
+  builder.Slice(original, {7}, {7 + 1024});
 
   std::vector<float> expected(1024);
   std::iota(values.begin(), values.end(), 7.0);
@@ -122,7 +122,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
-  builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
+  builder.Slice(original, {0, 0}, {0, 0});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {});
 }
@@ -130,7 +130,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
-  builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
+  builder.Slice(original, {0, 15}, {0, 20});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 5), {});
 }
@@ -138,7 +138,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
-  builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
+  builder.Slice(original, {1, 0}, {3, 0});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {});
 }
@@ -153,7 +153,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
+  builder.Slice(original, {128, 128}, {256, 256});
 
   Array2D<float> expected(128, 128);
   for (int row = 0; row < 128; ++row) {
@@ -171,7 +171,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
+  builder.Slice(original, {0, 3072}, {1, 4096});
 
   Array2D<float> expected(1, 1024);
   std::iota(expected.data(), expected.data() + 1024, 3072.0);
@@ -192,7 +192,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
   }
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
+  builder.Slice(original, {0, 0}, {16, 2});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
 }
 
@@ -204,7 +204,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
       ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}});
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR4FromArray4D(values);
-  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
+  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
 }
 
@@ -213,7 +213,6 @@ struct R2Spec {
   int64 input_dim1;
   std::array<int64, 2> slice_starts;
   std::array<int64, 2> slice_limits;
-  std::array<int64, 2> slice_strides;
   Layout layout;
 };
 
@@ -229,7 +228,7 @@ TEST_P(SliceR2Test, DoIt) {
 
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<int32>(input);
-  builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
+  builder.Slice(a, spec.slice_starts, spec.slice_limits);
 
   std::unique_ptr<Array2D<int32>> expected =
       ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits);
@@ -240,23 +239,19 @@ TEST_P(SliceR2Test, DoIt) {
 INSTANTIATE_TEST_CASE_P(
     SliceR2TestInstantiation, SliceR2Test,
     ::testing::Values(
-        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
-          LayoutUtil::MakeLayout({0, 1})},
-        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
+        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {256, 400, {{0, 300}}, {{256, 400}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
-          LayoutUtil::MakeLayout({0, 1})},
-        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
+        R2Spec {500, 400, {{111, 123}}, {{300, 257}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {256, 400, {{0, 300}}, {{256, 400}}, {{1, 1}},
+        R2Spec {500, 400, {{111, 123}}, {{300, 400}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {500, 400, {{111, 123}}, {{300, 257}}, {{1, 1}},
+        R2Spec {384, 512, {{128, 256}}, {{256, 384}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {500, 400, {{111, 123}}, {{300, 400}}, {{1, 1}},
-          LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}},
-          LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}},
+        R2Spec {357, 512, {{111, 256}}, {{301, 384}},
           LayoutUtil::MakeLayout({1, 0})}
     )
 );
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index afa7d871c0..ccd2a95658 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -666,8 +666,7 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   auto build_condition = [this, v6s32](int count) {
     ComputationBuilder builder(client_, TestName());
     auto prev = builder.Reshape(
-        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
-          {});
+        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}), {0}, {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 31f0c3147e..42d5c1d155 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -195,24 +195,16 @@ bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
 // 2. permutation.size() == input.size().
 template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
-                       C<T> input) {
-  tensorflow::gtl::ArraySlice<T> data(input);
-  CHECK(IsPermutation(permutation, data.size()));
-  std::vector<T> output(data.size());
+                       C<T> input_) {
+  tensorflow::gtl::ArraySlice<T> input(input_);
+  CHECK(IsPermutation(permutation, input.size()));
+  std::vector<T> output(input.size());
   for (size_t i = 0; i < permutation.size(); ++i) {
-    output[permutation[i]] = data[i];
+    output[permutation[i]] = input[i];
   }
   return output;
 }
 
-// Override of the above that works around compile failures with gcc 7.1.1.
-// For details see https://github.com/tensorflow/tensorflow/issues/10843
-template <typename T>
-std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
-                       const std::vector<T>& input) {
-  return Permute<std::vector, T>(permutation, input);
-}
-
 // Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
 std::vector<int64> InversePermutation(
     tensorflow::gtl::ArraySlice<int64> input_permutation);
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 86c72b3449..95c1f0995b 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -200,7 +200,7 @@ message OpMetadata {
   string op_name = 2;
   // Indicate a file and line that this op is associated to in a user's program.
   //
-  // e.g. it could be the file and line of user code that generated the op.
+  // e.g. it could be the file and line of user code that generated the op.
   string source_file = 3;
   int32 source_line = 4;
 }
@@ -369,7 +369,6 @@ message SliceRequest {
   ComputationDataHandle operand = 2;
   repeated int64 start_indices = 3;
   repeated int64 limit_indices = 4;
-  repeated int64 stride = 5;
 }
 
 message DynamicSliceRequest {
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index b1d18d2faf..1f180429b2 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -17,7 +17,6 @@ package org.tensorflow.contrib.android;
 
 import android.content.res.AssetManager;
 import android.os.Trace;
-import android.os.Build.VERSION;
 import android.text.TextUtils;
 import android.util.Log;
 import java.io.FileInputStream;
@@ -371,11 +370,9 @@ public class TensorFlowInferenceInterface {
   private void loadGraph(InputStream is, Graph g) throws IOException {
     final long startMs = System.currentTimeMillis();
 
-    if (VERSION.SDK_INT >= 18) {
-      Trace.beginSection("initializeTensorFlow");
-      Trace.beginSection("readGraphDef");
-    }
+    Trace.beginSection("initializeTensorFlow");
 
+    Trace.beginSection("readGraphDef");
     // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
     byte[] graphDef = new byte[is.available()];
     final int numBytesRead = is.read(graphDef);
@@ -386,22 +383,17 @@ public class TensorFlowInferenceInterface {
               + " of the graph, expected to read "
               + graphDef.length);
     }
+    Trace.endSection();
 
-    if (VERSION.SDK_INT >= 18) {
-      Trace.endSection(); // readGraphDef.
-      Trace.beginSection("importGraphDef");
-    }
-
+    Trace.beginSection("importGraphDef");
     try {
       g.importGraphDef(graphDef);
     } catch (IllegalArgumentException e) {
       throw new IOException("Not a valid TensorFlow Graph serialization: " + e.getMessage());
     }
+    Trace.endSection();
 
-    if (VERSION.SDK_INT >= 18) {
-      Trace.endSection(); // importGraphDef.
-      Trace.endSection(); // initializeTensorFlow.
-    }
+    Trace.endSection(); // initializeTensorFlow.
 
     final long endMs = System.currentTimeMillis();
     Log.i(
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
index fabf7a9b77..fba0cc6522 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
@@ -627,6 +627,11 @@ class MonteCarloCsiszarFDivergenceTest(test.TestCase):
       grad = lambda fs: gradients_impl.gradients(fs, s)[0]
 
       [
+          approx_kl_grad_,
+          approx_kl_self_normalized_grad_,
+          approx_kl_score_trick_grad_,
+          approx_kl_self_normalized_score_trick_grad_,
+          exact_kl_grad_,
           approx_kl_,
           approx_kl_self_normalized_,
           approx_kl_score_trick_,
@@ -638,23 +643,39 @@ class MonteCarloCsiszarFDivergenceTest(test.TestCase):
           grad(approx_kl_score_trick),
           grad(approx_kl_self_normalized_score_trick),
           grad(exact_kl),
+          approx_kl,
+          approx_kl_self_normalized,
+          approx_kl_score_trick,
+          approx_kl_self_normalized_score_trick,
+          exact_kl,
       ])
 
-      self.assertAllClose(
-          approx_kl_, exact_kl_,
-          rtol=0.06, atol=0.)
+      # Test average divergence.
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.02, atol=0.)
 
-      self.assertAllClose(
-          approx_kl_self_normalized_, exact_kl_,
-          rtol=0.05, atol=0.)
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.08, atol=0.)
 
-      self.assertAllClose(
-          approx_kl_score_trick_, exact_kl_,
-          rtol=0.06, atol=0.)
+      self.assertAllClose(approx_kl_score_trick_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_,
+                          rtol=0.08, atol=0.)
+
+      # Test average gradient-divergence.
+      self.assertAllClose(approx_kl_grad_, exact_kl_grad_,
+                          rtol=0.007, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_grad_, exact_kl_grad_,
+                          rtol=0.011, atol=0.)
+
+      self.assertAllClose(approx_kl_score_trick_grad_, exact_kl_grad_,
+                          rtol=0.018, atol=0.)
 
       self.assertAllClose(
-          approx_kl_self_normalized_score_trick_, exact_kl_,
-          rtol=0.05, atol=0.)
+          approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_,
+          rtol=0.017, atol=0.)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
index 7b51d8d932..09389e5d38 100644
--- a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
@@ -40,8 +40,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib import framework as contrib_framework
+from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import distribution
@@ -750,7 +750,7 @@ def monte_carlo_csiszar_f_divergence(
   ```none
   D_f[p(X), q(X)] := E_{q(X)}[ f( p(X) / q(X) ) ]
                   ~= m**-1 sum_j^m f( p(x_j) / q(x_j) ),
-                             where x_j ~iid q(x)
+                             where x_j ~iid q(X)
   ```
 
   Tricks: Reparameterization and Score-Gradient
@@ -759,8 +759,8 @@ def monte_carlo_csiszar_f_divergence(
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `nabla Avg{ s_i : i=1...n } = Avg{ nabla s_i : i=1...n }` where `S_n=Avg{s_i}`
-  and `s_i = f(x_i), x_i ~ q`.
+  `grad[Avg{ s_i : i=1...n }] = Avg{ grad[s_i] : i=1...n }` where `S_n=Avg{s_i}`
+  and `s_i = f(x_i), x_i ~iid q(X)`.
 
   However, if q is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of unreparameterized distributions. In
@@ -768,22 +768,17 @@ def monte_carlo_csiszar_f_divergence(
   gradient, i.e.,
 
   ```none
-  nabla E_q[f(X)]
-  = nabla int dx q(x) f(x)
-  = int dx nabla [ q(x) f(x) ]
-  = int dx q'(x) f(x) + q(x) f'(x)
+  grad[ E_q[f(X)] ]
+  = grad[ int dx q(x) f(x) ]
+  = int dx grad[ q(x) f(x) ]
+  = int dx [ q'(x) f(x) + q(x) f'(x) ]
   = int dx q(x) [q'(x) / q(x) f(x) + f'(x) ]
-  = int dx q(x) nabla [ log(q(x)) stopgrad[f(x)] + f(x) ]
-  = E_q[ nabla [ log(q(X)) stopgrad[f(X)] + f(X) ] ]
-  ~= Avg{ log(q(y_i)) stopgrad[f(y_i)] + f(y_i) : y_i = stopgrad[x_i], x_i ~ q}
+  = int dx q(x) grad[ f(x) q(x) / stop_grad[q(x)] ]
+  = E_q[ grad[ f(x) q(x) / stop_grad[q(x)] ] ]
   ```
 
   Unless `q.reparameterization_type != distribution.FULLY_REPARAMETERIZED` it is
-  usually preferable to `use_reparametrization = True`.
-
-  Warning: using `use_reparametrization = False` will mean that the result is
-  *not* the Csiszar f-Divergence. However its expected gradient *is* the
-  gradient of the Csiszar f-Divergence.
+  usually preferable to set `use_reparametrization = True`.
 
   Example Application:
 
@@ -817,10 +812,7 @@ def monte_carlo_csiszar_f_divergence(
 
   Returns:
     monte_carlo_csiszar_f_divergence: Floating-type `Tensor` Monte Carlo
-      approximation of the Csiszar f-Divergence. Warning: using
-      `use_reparametrization = False` will mean that the result is *not* the
-      Csiszar f-Divergence. However its expected gradient *is* the actual
-      gradient of the Csiszar f-Divergence.
+      approximation of the Csiszar f-Divergence.
 
   Raises:
     ValueError: if `q` is not a reparameterized distribution and
@@ -831,24 +823,16 @@ def monte_carlo_csiszar_f_divergence(
       to parameters) is valid.
   """
   with ops.name_scope(name, "monte_carlo_csiszar_f_divergence", [num_draws]):
-    x = q.sample(num_draws, seed=seed)
-    if use_reparametrization:
+    if (use_reparametrization and
+        q.reparameterization_type != distribution.FULLY_REPARAMETERIZED):
       # TODO(jvdillon): Consider only raising an exception if the gradient is
       # requested.
-      if q.reparameterization_type != distribution.FULLY_REPARAMETERIZED:
-        raise ValueError(
-            "Distribution `q` must be reparameterized, i.e., a diffeomorphic "
-            "transformation of a parameterless distribution. (Otherwise this "
-            "function has a biased gradient.)")
-      return math_ops.reduce_mean(f(p.log_prob(x) - q.log_prob(x)), axis=0)
-    else:
-      x = array_ops.stop_gradient(x)
-      logqx = q.log_prob(x)
-      fx = f(p.log_prob(x) - logqx)
-      # Alternatively we could have returned:
-      #   reduce_mean(fx * exp(logqx) / stop_gradient(exp(logqx)), axis=0)
-      # This is nice because it means the result is exactly the Csiszar
-      # f-Divergence yet the gradient is unbiased. However its numerically
-      # unstable since the q is not in log-domain.
-      return math_ops.reduce_mean(logqx * array_ops.stop_gradient(fx) + fx,
-                                  axis=0)
+      raise ValueError(
+          "Distribution `q` must be reparameterized, i.e., a diffeomorphic "
+          "transformation of a parameterless distribution. (Otherwise this "
+          "function has a biased gradient.)")
+    return monte_carlo.expectation_v2(
+        f=lambda x: f(p.log_prob(x) - q.log_prob(x)),
+        samples=q.sample(num_draws, seed=seed),
+        log_prob=q.log_prob,
+        use_reparametrization=use_reparametrization)
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 205b44ac12..9ffe08eded 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -74,7 +74,7 @@ if(WIN32)
   set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
-  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob2")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index bf89e64bfd..a971232e2f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -204,6 +204,7 @@ add_python_module("tensorflow/python/debug/examples")
 add_python_module("tensorflow/python/debug/lib")
 add_python_module("tensorflow/python/debug/wrappers")
 add_python_module("tensorflow/python/estimator")
+add_python_module("tensorflow/python/estimator/canned")
 add_python_module("tensorflow/python/estimator/export")
 add_python_module("tensorflow/python/estimator/inputs")
 add_python_module("tensorflow/python/estimator/inputs/queues")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index 42865ed404..d44e258bd2 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -120,7 +120,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):
 
     Doesn't actually do the sqrt! Named as such to agree with API.
 
-    To compute (M + V D V.T), we use the Woodbury matrix identity:
+    To compute (M + V D V.T), we use the Woodbury matrix identity:
       inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
     where,
       C = inv(D) + V.T inv(M) V.
@@ -166,7 +166,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):
   def _woodbury_sandwiched_term(self):
     """Computes the sandwiched term in the Woodbury identity.
 
-    Computes the "`C`" in the identity:
+    Computes the "`C`" in the identity:
        inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
     where,
        C = inv(D) + V.T inv(M) V.
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index b525809015..5b57a95c55 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -52,7 +52,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
   the RelaxedBernoulli can suffer from underflow issues. In many case loss
   functions such as these are invariant under invertible transformations of
   the random variables. The KL divergence, found in the variational autoencoder
-  loss, is an example. Because RelaxedBernoullis are sampled by a Logistic
+  loss, is an example. Because RelaxedBernoullis are sampled by a Logistic
   random variable followed by a `tf.sigmoid` op, one solution is to treat
   the Logistic as the random variable and `tf.sigmoid` as downstream. The
   KL divergences of two Logistics, which are always followed by a `tf.sigmoid`
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 2a4b92c729..26cf922d0a 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -47,7 +47,7 @@ def percentile(x,
   """Compute the `q`-th percentile of `x`.
 
   Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
-  way from the minimum to the maximum in a sorted copy of `x`.
+  way from the minimum to the maximum in a sorted copy of `x`.
 
   The values and distances of the two nearest neighbors as well as the
   `interpolation` parameter will determine the percentile if the normalized
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index fdee57695e..fd2c46d94d 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Vectorized Laplace distribution class, directly using LinearOperator."""
+"""Vectorized Laplace distribution class, directly using LinearOperator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 2234400fdc..762bc44814 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -446,7 +446,7 @@ class Transformer(object):
       # TODO(fkp): return a subgraph?
       op_, op_outputs_ = self.transform_op_handler(info, op)
       if op is op_:
-        raise ValueError("In-place transformation not allowed.")
+        raise ValueError("In-place tranformation not allowed.")
 
       # Process op.
       info.transformed_ops[op] = op_
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 324f510301..9f02fc0958 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -3261,7 +3261,7 @@ def conv2d(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/outputs.
+          for inputs/kernels/outputs.
       dilation_rate: tuple of 2 integers.
 
   Returns:
@@ -3309,7 +3309,7 @@ def conv2d_transpose(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/outputs.
+          for inputs/kernels/outputs.
 
   Returns:
       A tensor, result of transposed 2D convolution.
@@ -3395,7 +3395,7 @@ def conv3d(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/outputs.
+          for inputs/kernels/outputs.
       dilation_rate: tuple of 3 integers.
 
   Returns:
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index 34548c83c5..d287fa56d9 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -107,7 +107,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
     self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(rate=rate, noise_shape=noise_shape, seed=seed, **kwargs)
+    super(Dropout, self).__init__(**kwargs)
 
   def call(self, inputs, training=None):
     if training is None:
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index cdef55f599..5e8c23ed3e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -985,7 +985,7 @@ class LSTM(Recurrent):
 
   References:
       - [Long short-term
-        memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
+        memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
         (original 1997 paper)
       - [Supervised sequence labeling with recurrent neural
         networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
diff --git a/tensorflow/contrib/keras/python/keras/models_test.py b/tensorflow/contrib/keras/python/keras/models_test.py
index 99fd6e1cbe..50aba43c24 100644
--- a/tensorflow/contrib/keras/python/keras/models_test.py
+++ b/tensorflow/contrib/keras/python/keras/models_test.py
@@ -105,7 +105,7 @@ class TestModelSaving(test.TestCase):
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
-  def test_functional_model_saving(self):
+  def test_fuctional_model_saving(self):
     if h5py is None:
       return  # Skip test if models cannot be saved.
 
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index d8ff6d6b6a..f2a904b521 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1467,8 +1467,7 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError(
-        'num_outputs should be int or long, got %s.' % (num_outputs,))
+    raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
 
   layer_variable_getter = _build_variable_getter({'bias': 'biases',
                                                   'kernel': 'weights'})
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index 6b125534a4..b4b552c025 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -52,8 +52,8 @@ LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
 
 def _train_test_split(features_and_labels):
   features, labels = features_and_labels
-  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
-  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
+  train_set = (features[:len(features) / 2], labels[:len(features) / 2])
+  test_set = (features[len(features) / 2:], labels[len(features) / 2:])
   return train_set, test_set
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index d3bb0fda57..145d5c40fa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -729,7 +729,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertLess(loss, 0.07)
 
   def testSdcaOptimizerRealValuedFeatures(self):
-    """Tests LinearClassifier with SDCAOptimizer and real valued features."""
+    """Tests LinearClassifier with SDCAOptimizer and real valued features."""
 
     def input_fn():
       return {
@@ -776,7 +776,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertLess(loss, 0.05)
 
   def testSdcaOptimizerBucketizedFeatures(self):
-    """Tests LinearClassifier with SDCAOptimizer and bucketized features."""
+    """Tests LinearClassifier with SDCAOptimizer and bucketized features."""
 
     def input_fn():
       return {
@@ -802,7 +802,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerSparseFeatures(self):
-    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
+    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
 
     def input_fn():
       return {
@@ -833,7 +833,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerWeightedSparseFeatures(self):
-    """LinearClassifier with SDCAOptimizer and weighted sparse features."""
+    """LinearClassifier with SDCAOptimizer and weighted sparse features."""
 
     def input_fn():
       return {
@@ -864,7 +864,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerCrossedFeatures(self):
-    """Tests LinearClassifier with SDCAOptimizer and crossed features."""
+    """Tests LinearClassifier with SDCAOptimizer and crossed features."""
 
     def input_fn():
       return {
@@ -897,7 +897,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerMixedFeatures(self):
-    """Tests LinearClassifier with SDCAOptimizer and a mix of features."""
+    """Tests LinearClassifier with SDCAOptimizer and a mix of features."""
 
     def input_fn():
       return {
@@ -1509,7 +1509,7 @@ class LinearRegressorTest(test.TestCase):
     self.assertLess(loss, 0.05)
 
   def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
-    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
+    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
 
     def input_fn():
       return {
@@ -1581,7 +1581,7 @@ class LinearRegressorTest(test.TestCase):
     self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
 
   def testSdcaOptimizerBiasOnly(self):
-    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when it's the only feature present.
@@ -1614,7 +1614,7 @@ class LinearRegressorTest(test.TestCase):
         regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)
 
   def testSdcaOptimizerBiasAndOtherColumns(self):
-    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when there are other features present.
@@ -1676,7 +1676,7 @@ class LinearRegressorTest(test.TestCase):
         regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)
 
   def testSdcaOptimizerBiasAndOtherColumnsFabricatedCentered(self):
-    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when there are other features present.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
index 284e2cfd7a..6ebfeb0f16 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
@@ -123,7 +123,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["probabilities"].eval(),
                           regression_output.value.eval())
 
-  def testEstimatorSpec_export_classification(self):
+  def testEstimatorSpec_export_classsification(self):
     predictions = self.create_predictions()
     output_alternatives = {"classification_head": (
         constants.ProblemType.CLASSIFICATION, predictions)}
@@ -143,7 +143,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classification_with_missing_scores(self):
+  def testEstimatorSpec_export_classsification_with_missing_scores(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["scores"]
@@ -165,7 +165,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classification_with_missing_scores_proba(self):
+  def testEstimatorSpec_export_classsification_with_missing_scores_proba(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["scores"]
@@ -187,7 +187,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classification_with_missing_classes(self):
+  def testEstimatorSpec_export_classsification_with_missing_classes(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["classes"]
@@ -208,7 +208,7 @@ class ModelFnopsTest(test.TestCase):
                           classification_output.scores.eval())
       self.assertIsNone(classification_output.classes)
 
-  def testEstimatorSpec_export_classification_with_nonstring_classes(self):
+  def testEstimatorSpec_export_classsification_with_nonstring_classes(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     output_alternatives_predictions["classes"] = constant_op.constant(
diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py
index 4283240d01..234605ff76 100644
--- a/tensorflow/contrib/learn/python/learn/models.py
+++ b/tensorflow/contrib/learn/python/learn/models.py
@@ -63,7 +63,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
     x: tensor or placeholder for input features.
     y: tensor or placeholder for labels.
     init_mean: the mean value to use for initialization.
-    init_stddev: the standard deviation to use for initialization.
+    init_stddev: the standard deviation to use for initialization.
 
   Returns:
     Predictions and loss tensors.
@@ -124,7 +124,7 @@ def logistic_regression(x,
                   will check if graph contains tensor `class_weight:0`.
                   If that is not provided either all ones are used.
     init_mean: the mean value to use for initialization.
-    init_stddev: the standard deviation to use for initialization.
+    init_stddev: the standard deviation to use for initialization.
 
   Returns:
     Predictions and loss tensors.
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index ce8518267f..d58b9744ac 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -208,7 +208,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
   Sample Usages:
 
   ```python
-  mapping_string = tf.constant(["emerson", "lake", "palmer"])
+  mapping_string = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   table = tf.contrib.lookup.index_to_string_table_from_tensor(
       mapping_string, default_value="UNKNOWN")
@@ -260,11 +260,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   For example:
 
   ```python
-<<<<<<< HEAD
-  mapping_string = tf.constant(["emerson", "lake", "palmer"])
-=======
   mapping_string = tf.constant(["emerson", "lake", "palmer")
->>>>>>> 338a7ead4475d6b97b420d6d1c56ff66815e3e7b
   indices = tf.constant([1, 5], tf.int64)
   values = tf.contrib.lookup.index_to_string(
       indices, mapping=mapping_string, default_value="UNKNOWN")
diff --git a/tensorflow/contrib/makefile/build_with_docker.sh b/tensorflow/contrib/makefile/build_with_docker.sh
index 51a73fafe5..7fe38f4b5d 100755
--- a/tensorflow/contrib/makefile/build_with_docker.sh
+++ b/tensorflow/contrib/makefile/build_with_docker.sh
@@ -23,7 +23,7 @@
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 WORKSPACE="${SCRIPT_DIR}/../../../"
-cd ${WORKSPACE} || exit 1
+cd ${WORKSPACE}
 
 DOCKER_IMG_NAME="tf-make-base"
 DOCKER_CONTEXT_PATH="${WORKSPACE}tensorflow/contrib/makefile/"
diff --git a/tensorflow/contrib/makefile/compile_android_protobuf.sh b/tensorflow/contrib/makefile/compile_android_protobuf.sh
index fadbe271b8..50d72d6093 100755
--- a/tensorflow/contrib/makefile/compile_android_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_android_protobuf.sh
@@ -27,7 +27,7 @@ cc_prefix="${CC_PREFIX}"
 usage() {
   echo "Usage: $(basename "$0") [-a:c]"
   echo "-a [Architecture] Architecture of target android [default=armeabi-v7a] \
-(supported architecture list: \
+(supported architecture list: \
 arm64-v8a armeabi armeabi-v7a armeabi-v7a-hard mips mips64 x86 x86_64)"
   echo "-c Clean before building protobuf for target"
   echo "\"NDK_ROOT\" should be defined as an environment variable."
@@ -130,7 +130,7 @@ elif [[ ${ARCHITECTURE} == "x86_64" ]]; then
     sysroot_arch="x86_64"
     bin_prefix="x86_64-linux-android"
 else
-    echo "architecture ${ARCHITECTURE} is not supported." 1>&2
+    echo "architecture ${ARCHITECTURE} is not supported." 1>&2
     usage
     exit 1
 fi
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index e8b9454e7e..d1012a6c93 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x -e
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,9 +15,6 @@
 # ==============================================================================
 # Builds protobuf 3 for iOS.
 
-set -x
-set -e
-
 SCRIPT_DIR=$(dirname $0)
 source "${SCRIPT_DIR}/build_helper.subr"
 
@@ -33,17 +30,17 @@ fi
 
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
-GENDIR=$(pwd)/gen/protobuf_ios/
+GENDIR=`pwd`/gen/protobuf_ios/
 LIBDIR=${GENDIR}lib
 mkdir -p ${LIBDIR}
 
 OSX_VERSION=darwin14.0.0
 
-IPHONEOS_PLATFORM=$(xcrun --sdk iphoneos --show-sdk-platform-path)
-IPHONEOS_SYSROOT=$(xcrun --sdk iphoneos --show-sdk-path)
-IPHONESIMULATOR_PLATFORM=$(xcrun --sdk iphonesimulator --show-sdk-platform-path)
-IPHONESIMULATOR_SYSROOT=$(xcrun --sdk iphonesimulator --show-sdk-path)
-IOS_SDK_VERSION=$(xcrun --sdk iphoneos --show-sdk-version)
+IPHONEOS_PLATFORM=`xcrun --sdk iphoneos --show-sdk-platform-path`
+IPHONEOS_SYSROOT=`xcrun --sdk iphoneos --show-sdk-path`
+IPHONESIMULATOR_PLATFORM=`xcrun --sdk iphonesimulator --show-sdk-platform-path`
+IPHONESIMULATOR_SYSROOT=`xcrun --sdk iphonesimulator --show-sdk-path`
+IOS_SDK_VERSION=`xcrun --sdk iphoneos --show-sdk-version`
 MIN_SDK_VERSION=8.0
 
 CFLAGS="-DNDEBUG -Os -pipe -fPIC -fno-exceptions"
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index bcf097b303..6f47b80780 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -20,7 +20,7 @@ source "${SCRIPT_DIR}/build_helper.subr"
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
 function less_than_required_version() {
-  echo $1 | (IFS=. read -r major minor micro
+  echo $1 | (IFS=. read major minor micro
     if [ $major -ne $2 ]; then
       [ $major -lt $2 ]
     elif [ $minor -ne $3 ]; then
@@ -31,7 +31,7 @@ function less_than_required_version() {
   )
 }
 
-ACTUAL_XCODE_VERSION=$(xcodebuild -version | head -n 1 | sed 's/Xcode //')
+ACTUAL_XCODE_VERSION=`xcodebuild -version | head -n 1 | sed 's/Xcode //'`
 REQUIRED_XCODE_VERSION=7.3.0
 if less_than_required_version $ACTUAL_XCODE_VERSION 7 3 0
 then
diff --git a/tensorflow/contrib/makefile/compile_pi_protobuf.sh b/tensorflow/contrib/makefile/compile_pi_protobuf.sh
index f863d80009..2aae2d5f4e 100755
--- a/tensorflow/contrib/makefile/compile_pi_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_pi_protobuf.sh
@@ -15,15 +15,15 @@
 # ==============================================================================
 # Builds protobuf 3 for iOS.
 
-cd tensorflow/contrib/makefile || exit 1
+cd tensorflow/contrib/makefile
 
-GENDIR=$(pwd)/gen/protobuf_pi/
+GENDIR=`pwd`/gen/protobuf_pi/
 LIBDIR=${GENDIR}
 mkdir -p ${LIBDIR}
 
 CXX=arm-linux-gnueabihf-g++
 
-cd downloads/protobuf || exit 1
+cd downloads/protobuf
 
 ./autogen.sh
 if [ $? -ne 0 ]
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 288f59fed4..c7ed663131 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -7,7 +7,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 tf_gen_op_wrapper_py(
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index c8552fc050..9c5e9fec9d 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -79,7 +79,7 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://www.bioinf.jku.at/publications/older/2604.pdf
+    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 3496b355b4..491d87f62d 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -65,9 +65,7 @@ class GatherTreeTest(test.TestCase):
         _ = beams.eval()
 
   def testBadParentValuesOnGPU(self):
-    # Only want to run this test on CUDA devices, as gather_tree is not
-    # registered for SYCL devices.
-    if not test.is_gpu_available(cuda_only=True):
+    if not test.is_gpu_available():
       return
     # (max_time = 4, batch_size = 1, beams = 3)
     # bad parent in beam 1 time 1; appears as a negative index at time 0
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 4bde661698..d37c632be7 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -836,7 +836,7 @@ with tf.Session() as sess:
   for batch_id in range(num_batches):
     sess.run(names_to_updates.values())
 
-  metric_values = sess.run(names_to_values.values())
+  metric_values = sess.run(names_to_values.values())
   for metric, value in zip(names_to_values.keys(), metric_values):
     print('Metric %s has value: %f' % (metric, value))
 ```
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index 4fa1ccea69..824ba4c09b 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,26 +1,3 @@
 # tfprof: TensorFlow Profiler and Beyond
 
 # Full Document in tensorflow/tools/tfprof/README.md
-
-Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
-
-Consultants: Jon Shlens, Pete Warden
-
-###Major Features
-
-1.  Measure model parameters, float operations, tensor shapes.
-2.  Profile op execution times, requested memory size and device placement.
-3.  Inspect checkpoint tensors' shapes and their values.
-4.  Selectively group, filter, account and order ops.
-
-####tfprof supports 3 views to organize TensorFlow model profiles
-
-    *  code view: Stats are associated your Python codes and organized as call stacks.
-    *  scope view: Stats are organized as name scope hierarchies.
-    *  graph view: Stats are organized as Tensorflow Op graph.
-
-####For each view, there are 3 ways to display outputs:
-
-    *  stdout: Results are written to stdout.
-    *  timeline: Visualized in chrome browser as time series.
-    *  file: Results are dumped to file.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i
index 40f29ae8a2..582c36e339 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i
@@ -43,7 +43,6 @@ using tensorflow::int64;
 %unignore tensorflow::tfprof::DeleteProfiler;
 %unignore tensorflow::tfprof::AddStep;
 %unignore tensorflow::tfprof::Profile;
-%unignore tensorflow::tfprof::Advise;
 
 %include "tensorflow/tools/tfprof/internal/print_model_analysis.h"
 
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
index 71468dde37..9c59df3117 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
@@ -89,7 +89,7 @@ def _run_loop_model():
 class RunMetadataTest(test.TestCase):
 
   def testGPU(self):
-    if not test.is_gpu_available(cuda_only=True):
+    if not test.is_gpu_available():
       return
 
     ops.reset_default_graph()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
index 419beac0b9..c781d2af4e 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
@@ -20,6 +20,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger
 from tensorflow.contrib.tfprof.python.tools.tfprof.internal import pywrap_tensorflow_print_model_analysis_lib as print_mdl
 from tensorflow.python.framework import errors
@@ -108,49 +110,77 @@ PRINT_ALL_TIMING_MEMORY = {
     'dump_to_file': ''
 }
 
+# The following options are for 'advise' tfprof_cmd.
+# Show all advice.
+ALL_ADVICE = {
+    'ExpensiveOperationChecker': {},
+    'AcceleratorUtilizationChecker': {},
+    'JobChecker': {},  # Only available internally.
+    'OperationChecker': {},
+}
+
 # pylint: enable=bad-whitespace
 # pylint: enable=bad-continuation
 
 
-def _build_options(tfprof_options):
+def _build_options(options):
   """Build tfprof.OptionsProto.
 
   Args:
-    tfprof_options: A dictionary of options.
+    options: A dictionary of options.
   Returns:
     tfprof.OptionsProto.
   """
   opts = tfprof_options_pb2.OptionsProto()
-  opts.max_depth = tfprof_options.get('max_depth', 10)
-  opts.min_bytes = tfprof_options.get('min_bytes', 0)
-  opts.min_micros = tfprof_options.get('min_micros', 0)
-  opts.min_params = tfprof_options.get('min_params', 0)
-  opts.min_float_ops = tfprof_options.get('min_float_ops', 0)
-  opts.min_occurrence = tfprof_options.get('min_occurrence', 0)
+  opts.max_depth = options.get('max_depth', 10)
+  opts.min_bytes = options.get('min_bytes', 0)
+  opts.min_micros = options.get('min_micros', 0)
+  opts.min_params = options.get('min_params', 0)
+  opts.min_float_ops = options.get('min_float_ops', 0)
+  opts.min_occurrence = options.get('min_occurrence', 0)
 
-  opts.step = tfprof_options.get('step', -1)
+  opts.step = options.get('step', -1)
 
-  opts.order_by = tfprof_options.get('order_by', 'name')
+  opts.order_by = options.get('order_by', 'name')
 
-  for p in tfprof_options.get('account_type_regexes', []):
+  for p in options.get('account_type_regexes', []):
     opts.account_type_regexes.append(p)
-  for p in tfprof_options.get('start_name_regexes', []):
+  for p in options.get('start_name_regexes', []):
     opts.start_name_regexes.append(p)
-  for p in tfprof_options.get('trim_name_regexes', []):
+  for p in options.get('trim_name_regexes', []):
     opts.trim_name_regexes.append(p)
-  for p in tfprof_options.get('show_name_regexes', []):
+  for p in options.get('show_name_regexes', []):
     opts.show_name_regexes.append(p)
-  for p in tfprof_options.get('hide_name_regexes', []):
+  for p in options.get('hide_name_regexes', []):
     opts.hide_name_regexes.append(p)
-  opts.account_displayed_op_only = tfprof_options.get(
-      'account_displayed_op_only', False)
+  opts.account_displayed_op_only = options.get('account_displayed_op_only',
+                                               False)
 
-  for p in tfprof_options.get('select', []):
+  for p in options.get('select', []):
     opts.select.append(p)
 
-  opts.output = tfprof_options.get('output', 'stdout')
-  opts.dump_to_file = tfprof_options.get('dump_to_file', '')
+  opts.output = options.get('output', 'stdout')
+  opts.dump_to_file = options.get('dump_to_file', '')
+
+  return opts
+
+
+def _build_advisor_options(options):
+  """Build tfprof.AdvisorOptionsProto.
 
+  Args:
+    options: A dictionary of options. See ALL_ADVICE example.
+  Returns:
+    tfprof.AdvisorOptionsProto.
+  """
+  opts = tfprof_options_pb2.AdvisorOptionsProto()
+  if options is None:
+    return opts
+  for checker, checker_opts in six.iteritems(options):
+    checker_ops_pb = tfprof_options_pb2.AdvisorOptionsProto.CheckerOption()
+    for k, v in six.iteritems(checker_opts):
+      checker_ops_pb[k] = v
+    opts.checkers[checker].MergeFrom(checker_ops_pb)
   return opts
 
 
@@ -190,7 +220,7 @@ class Profiler(object):
       else:
         _ = sess.run(...)
     # Auto detect problems and generate advice.
-    profiler.advise()
+    profiler.advise(model_analyzer.ALL_ADVICE)
   """
 
   def __init__(self, graph, op_log=None):
@@ -288,9 +318,19 @@ class Profiler(object):
         print_mdl.Profile('graph'.encode('utf-8'), opts.SerializeToString()))
     return tfprof_node
 
-  def advise(self):
-    """Automatically detect problems and generate reports."""
-    print_mdl.Advise()
+  def advise(self, options=ALL_ADVICE):  # pylint: disable=dangerous-default-value
+    """Automatically detect problems and generate reports.
+
+    Args:
+      options: A dict of options.
+    Returns:
+      An AdviceProto that contains the reports from all checkers.
+    """
+    advise_pb = tfprof_output_pb2.AdviceProto()
+    opts = _build_advisor_options(options)
+    advise_pb.ParseFromString(
+        print_mdl.Profile('advise'.encode('utf-8'), opts.SerializeToString()))
+    return advise_pb
 
 
 def print_model_analysis(graph,
@@ -354,3 +394,36 @@ def print_model_analysis(graph,
         None, None, 'unknown tfprof_cmd: %s\n' % tfprof_cmd)
 
   return tfprof_node
+
+
+def advise(graph, run_meta=None, tfprof_options=ALL_ADVICE):  # pylint: disable=dangerous-default-value
+  """Auto profile and advise.
+
+    Builds profiles and automatically checks for anomalies in various
+    aspects. See go/tfprof or README for examples and tutorials.
+
+  Args:
+    graph: tf.Graph.
+    run_meta: tensorflow::RunMetadata proto. Allows auto-profile
+              time and memory.
+    tfprof_options: see ALL_ADVICE example above.
+  Returns:
+    Returns AdviceProto proto
+  """
+  # pylint: disable=protected-access
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, None, run_meta, add_trace=True)
+  # pylint: enable=protected-access
+
+  run_meta_str = run_meta.SerializeToString() if run_meta else b''
+
+  opts = _build_advisor_options(tfprof_options)
+  ret = tfprof_output_pb2.AdviceProto()
+  ret.ParseFromString(
+      print_mdl.PrintModelAnalysis(
+          graph.as_graph_def(add_shapes=True).SerializeToString(),
+          run_meta_str,
+          op_log.SerializeToString(),
+          'advise'.encode('utf-8'),
+          opts.SerializeToString()))
+  return ret
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
index 9db752c577..fea27a82a5 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
@@ -126,7 +126,7 @@ class PrintModelAnalysisTest(test.TestCase):
     opts['account_displayed_op_only'] = False
     opts['select'] = ['params', 'float_ops']
 
-    with session.Session() as sess, ops.device('/cpu:0'):
+    with session.Session() as sess:
       x = lib.BuildFullModel()
 
       sess.run(variables.global_variables_initializer())
@@ -176,6 +176,7 @@ class PrintModelAnalysisTest(test.TestCase):
     opts['select'] = [
         'bytes', 'params', 'float_ops', 'device'
     ]
+    opts['output'] = 'none'
 
     with session.Session() as sess:
       x = lib.BuildSmallModel()
@@ -276,6 +277,33 @@ class PrintModelAnalysisTest(test.TestCase):
       self.assertEqual(total_children, 15)
       self.assertGreater(input_shapes, 0)
 
+  def testAdvisor(self):
+    ops.reset_default_graph()
+
+    with session.Session() as sess:
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(
+          x,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE),
+          run_metadata=run_meta)
+
+      advice_pb = model_analyzer.advise(sess.graph, run_meta)
+      self.assertTrue('AcceleratorUtilizationChecker' in advice_pb.checkers)
+      self.assertTrue('ExpensiveOperationChecker' in advice_pb.checkers)
+      self.assertTrue('OperationChecker' in advice_pb.checkers)
+
+      checker = advice_pb.checkers['AcceleratorUtilizationChecker']
+      if test.is_gpu_available():
+        self.assertGreater(len(checker.reports), 0)
+      else:
+        self.assertEqual(len(checker.reports), 0)
+      checker = advice_pb.checkers['ExpensiveOperationChecker']
+      self.assertGreater(len(checker.reports), 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py
index 5daaafd7c8..c7113b6a57 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py
@@ -129,7 +129,7 @@ class ProfilerTest(test.TestCase):
     opts = model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
     opts['account_type_regexes'] = ['.*']
 
-    with session.Session() as sess, ops.device('/cpu:0'):
+    with session.Session() as sess:
       r1, r2, r3 = lib.BuildSplitableModel()
       sess.run(variables.global_variables_initializer())
 
@@ -179,8 +179,18 @@ class ProfilerTest(test.TestCase):
       self.assertEqual(lib.SearchTFProfNode(pb2, 'add'), None)
       self.assertGreater(lib.SearchTFProfNode(pb3, 'add').exec_micros, 0)
 
-      # TODO(xpan): Better test of advisor.
-      profiler.advise()
+      advice_pb = profiler.advise(model_analyzer.ALL_ADVICE)
+      self.assertTrue('AcceleratorUtilizationChecker' in advice_pb.checkers)
+      self.assertTrue('ExpensiveOperationChecker' in advice_pb.checkers)
+      self.assertTrue('OperationChecker' in advice_pb.checkers)
+
+      checker = advice_pb.checkers['AcceleratorUtilizationChecker']
+      if test.is_gpu_available():
+        self.assertGreater(len(checker.reports), 0)
+      else:
+        self.assertEqual(len(checker.reports), 0)
+      checker = advice_pb.checkers['ExpensiveOperationChecker']
+      self.assertGreater(len(checker.reports), 0)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index a895d90b8e..24b733dd29 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -226,7 +226,7 @@ def checkpoints_iterator(checkpoint_dir,
 
   This behavior gives control to callers on what to do if checkpoints do not
   come fast enough or stop being generated.  For example, if callers have a way
-  to detect that the training has stopped and know that no new checkpoints
+  to detect that the training has stopped and know that no new checkpoints
   will be generated, they can provide a `timeout_fn` that returns `True` when
   the training has stopped.  If they know that the training is still going on
   they return `False` instead.
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 6f3a616fe8..bc687be0ab 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -684,6 +683,7 @@ void RdmaTensorBuffer::SendNextItem() {
                          << " error message: " << status.error_message();
       size_t buffer_size = RdmaMessage::kMessageTotalBytes;
       size_t tensor_bytes = 0;
+      TensorProto proto;
       // Figures out which device the tensor is hosted on.
       Device* src_dev = nullptr;
       Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
@@ -703,47 +703,21 @@ void RdmaTensorBuffer::SendNextItem() {
       CHECK(s.ok()) << "dst device not found";
       AllocatorAttributes dst_alloc_attr;
       dst_alloc_attr.set_on_host(true);
-
-      bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
       // string tensor needs to be serialized
-      Tensor copy;
-      StringPiece copy_buf;
-      TensorProto proto;
       if (src_dev->tensorflow_gpu_device_info() &&
           (!send_args.alloc_attrs.on_host())) {
         CHECK(send_args.device_context)
-          << "send dev name: " << src_dev->name()
-          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-
-        if (can_memcpy) {
-          AllocatorAttributes host_alloc_attrs;
-          host_alloc_attrs.set_gpu_compatible(true);
-          host_alloc_attrs.set_on_host(true);
-          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-          copy = Tensor(alloc, in.dtype(), in.shape());
-          s = VerbsUtil::CopyGPUTensorToCPUSync(
-              src_dev, send_args.device_context, &in, &copy);
-          CHECK(s.ok()) << "copy tensor from gpu sync";
-          copy_buf = copy.tensor_data();
-        } else {
-          // "val" is on a GPU. Uses GPUUtil to fill the proto.
-          s = VerbsUtil::SetProtoFromGPUSync(
-              in, src_dev, send_args.device_context, &proto, is_dead);
-          CHECK(s.ok()) << "set proto from gpu sync";
-        }
+            << "send dev name: " << src_dev->name()
+            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+        // "val" is on a GPU. Uses GPUUtil to fill the proto.
+        s = VerbsUtil::SetProtoFromGPUSync(
+            in, src_dev, send_args.device_context, &proto, is_dead);
+        CHECK(s.ok()) << "set proto from gpu sync";
       } else {
         // tensor is in CPU memory.
-        if (can_memcpy) {
-          copy_buf = in.tensor_data();
-        } else {
-          in.AsProtoTensorContent(&proto);
-        }
-      }
-      if (can_memcpy) {
-        tensor_bytes = in.TotalBytes();
-      } else {
-        tensor_bytes = proto.ByteSize();
+        in.AsProtoTensorContent(&proto);
       }
+      tensor_bytes = proto.ByteSize();
       // maybe some margin for string tensor?
       buffer_size += tensor_bytes;
       // prepare message
@@ -797,16 +771,7 @@ void RdmaTensorBuffer::SendNextItem() {
               static_cast<void*>(static_cast<char*>(buffer_) +
                                  RdmaMessage::kTensorBufferStartIndex);
           CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-          if (can_memcpy) {
-            CHECK(copy_buf.size() == tensor_bytes)
-               << "unexpected tensor size: "
-               << copy_buf.size()
-               << " != "
-               << tensor_bytes;
-            memcpy(output, copy_buf.data(), tensor_bytes);
-          } else {
-            proto.SerializeToArray(output, tensor_bytes);
-          }
+          proto.SerializeToArray(output, tensor_bytes);
         } else {
           buffer_size = RdmaMessage::kMessageTotalBytes;
         }
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 9ea696589a..5871400f26 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -100,40 +99,12 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
     if (!rm.is_dead_) {
       void* input = static_cast<char*>(rb->buffer_) +
                     RdmaMessage::kTensorBufferStartIndex;
-      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
-      if (can_memcpy) {
-        if (dst_dev->tensorflow_gpu_device_info() &&
-            (!recv_args.alloc_attrs.on_host())) {
-          CHECK(recv_args.device_context)
-            << "send dev name: " << src_dev->name()
-            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-
-          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
-          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
-          s = VerbsUtil::CopyCPUTensorToGPUSync(&copy, recv_args.device_context,
-                                                dst_dev, &gpu_copy);
-          CHECK(s.ok()) << "copy tensor to gpu sync";
-          val = std::move(gpu_copy);
-        } else {
-          AllocatorAttributes host_alloc_attrs;
-          host_alloc_attrs.set_gpu_compatible(true);
-          host_alloc_attrs.set_on_host(true);
-          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-          val = std::move(copy);
-        }
-      } else {
-        TensorProto proto;
-        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-              rb->size_);
-        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-            << "fail to parse proto from array";
-        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
-      }
+      TensorProto proto;
+      CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
+            rb->size_);
+      CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
+          << "fail to parse proto from array";
+      s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
     }
 
     rc->RemoveRecvCallback(key_with_step_id);
diff --git a/tensorflow/contrib/verbs/verbs_util.cc b/tensorflow/contrib/verbs/verbs_util.cc
index 76e44d34a9..c3350f7958 100644
--- a/tensorflow/contrib/verbs/verbs_util.cc
+++ b/tensorflow/contrib/verbs/verbs_util.cc
@@ -20,40 +20,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 namespace tensorflow {
 
-// static sync wrapper:
-Status VerbsUtil::CopyGPUTensorToCPUSync(Device* gpu_device,
-                              const DeviceContext* device_context,
-                              const Tensor* gpu_tensor,
-                              Tensor* cpu_tensor) {
-  Notification n;
-  Status status;
-  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context,
-                              gpu_tensor, cpu_tensor,
-                              [&n, &status](const Status& s) {
-                                status = s;
-                                n.Notify();
-                              });
-  n.WaitForNotification();
-  return status;
-}
-
-// static sync wrapper:
-Status VerbsUtil::CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
-                                         const DeviceContext* device_context,
-                                         Device* gpu_device,
-                                         Tensor* gpu_tensor) {
-  Notification n;
-  Status status;
-  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, device_context,
-                              gpu_device, gpu_tensor,
-                              [&n, &status](const Status& s) {
-                                status = s;
-                                n.Notify();
-                              });
-  n.WaitForNotification();
-  return status;
-}
-
 // static sync wrapper:
 Status VerbsUtil::SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                       const DeviceContext* device_context,
diff --git a/tensorflow/contrib/verbs/verbs_util.h b/tensorflow/contrib/verbs/verbs_util.h
index d9da396228..cbc01adae4 100644
--- a/tensorflow/contrib/verbs/verbs_util.h
+++ b/tensorflow/contrib/verbs/verbs_util.h
@@ -28,16 +28,6 @@ class TensorProto;
 
 class VerbsUtil {
  public:
-  // synchronous wrapper of CopyGPUTensorToCPU
-  static Status CopyGPUTensorToCPUSync(Device* gpu_device,
-                                       const DeviceContext* device_context,
-                                       const Tensor* gpu_tensor,
-                                       Tensor* cpu_tensor);
-  // synchronous wrapper of CopyCPUTensorToGPU
-  static Status CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
-                                       const DeviceContext* device_context,
-                                       Device* gpu_device,
-                                       Tensor* gpu_tensor);
   // synchronous wrapper of SetProtoFromGPU
   static Status SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                     const DeviceContext* device_context,
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 4b951691fb..21a20bcc4d 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -654,11 +654,11 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   // If requested via RunOptions, output the partition graphs.
   if (run_options.output_partition_graphs()) {
-    protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+    protobuf::RepeatedPtrField<GraphDef>* parition_graph_defs =
         run_metadata->mutable_partition_graphs();
     for (const PerPartitionExecutorsAndLib& exec_and_lib :
          executors_and_keys->items) {
-      GraphDef* partition_graph_def = partition_graph_defs->Add();
+      GraphDef* partition_graph_def = parition_graph_defs->Add();
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index d6f35fe24c..3827596a67 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -279,12 +279,9 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
 
         DeleteDumpDir();
       } else {
-        // CUDA and SYCL devices do not have an Identity op for strings
         LOG(ERROR) << "Error: " << s;
         ASSERT_TRUE((a_dev.device_type() == DEVICE_GPU) ||
-                    (a_dev.device_type() == DEVICE_SYCL) ||
-                    (b_dev.device_type() == DEVICE_GPU) ||
-                    (b_dev.device_type() == DEVICE_SYCL));
+                    (b_dev.device_type() == DEVICE_GPU));
         ASSERT_FALSE(s.ok());
       }
     }
diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h
index 950737c39a..56355eaf36 100644
--- a/tensorflow/core/framework/graph_def_util.h
+++ b/tensorflow/core/framework/graph_def_util.h
@@ -62,7 +62,7 @@ Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
 // attr with a default was added). Note that this will not affect
 // attrs with non-default values, so you must run a
 // ValidateGraphDef...() function to see if the result is in fact
-// compatible. If not nullptr, the op/attr pairs that were removed
+// compatible. If not nullptr, the op/attr pairs that were removed
 // are added to '*op_attr_removed'.
 //
 // Expected usage, for a producer that wants to prepare a graph for
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index a4dd06de45..c5a0983a54 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -205,6 +205,7 @@ class OpDefBuilderWrapper;
 template <>
 class OpDefBuilderWrapper<true> {
  public:
+  typedef OpDefBuilderWrapper<true> WrapperType;
   OpDefBuilderWrapper(const char name[]) : builder_(name) {}
   OpDefBuilderWrapper<true>& Attr(StringPiece spec) {
     builder_.Attr(spec);
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index a164fe61b5..49eecc0b08 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -307,7 +307,7 @@ class Tensor {
   /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing the
   /// first 'begin' Tensor dimensions into the first dimension of the result and
   /// the Tensor dimensions of the last dims() - 'begin' - NDIMS into the last
-  /// dimension of the result. If 'begin' < 0 then the |'begin'| leading
+  /// dimension of the result. If 'begin' < 0 then the |'begin'| leading
   /// dimensions of size 1 will be added. If 'begin' + NDIMS > dims() then
   /// 'begin' + NDIMS - dims() trailing dimensions of size 1 will be added.
   template <typename T, size_t NDIMS = 3>
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 625780e7c9..94741a11ff 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -247,10 +247,16 @@ namespace tensorflow {
 //
 //           P = Conv2DWithBiasBackpropBias(O, O_m)
 //
-// Rewrite of BiasAddGrad into Conv2DWithBiasBackpropBias takes place depending
-// on the matching 'context'. The term context is loosely related to which
-// forward op is _associated_ to BiasAddGrad. If it is _MklConv2DWithBias then
-// we consider it Conv2D context; if it is MatMul, then it is MatMul context.
+// 'Distance' between input of BiasAddGrad and _MklConv2D in terms of hops is
+// the context matching depth. If _MklConv2DWithBias is not within the context
+// matching depth, then we do not rewrite BiasAddGrad.
+
+// How many hops do we search for matching node in the backward dataflow graph?
+// We use maxhop of 10 based on empirical observations. Also, these are
+// maxhops in backward data-flow graph. Since input of forward nodes (Conv2D)
+// directly goes to backward nodes, we do not expect the hop-distance
+// would be more than few nodes.
+static size_t kNodeMergeContextMaxDepth = 10;
 
 class MklLayoutRewritePass : public GraphOptimizationPass {
  public:
@@ -274,8 +280,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.max_pool = "MaxPool";
     csinfo_.max_pool_grad = "MaxPoolGrad";
     csinfo_.mkl_conv2d = "_MklConv2D";
-    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
-    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
                                    "_MklConv2DWithBiasBackpropBias";
@@ -356,12 +360,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
                       csinfo_.mkl_conv2d_with_bias});
 
+    // We use maxhop of 10 based on empirical observations. Also, these are
+    // maxhops in backward data-flow graph. Since input of forward nodes
+    // (Conv2D) directly goes to backward nodes, we do not expect the
+    // hop-distance would be more than few nodes.
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
-                                   IsBiasAddGradInMatMulContext};
+                                   kNodeMergeContextMaxDepth};
 
     biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
                                    csinfo_.mkl_conv2d_with_bias,
-                                   IsBiasAddGradInConv2DWithBiasContext};
+                                   kNodeMergeContextMaxDepth};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -384,7 +392,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string node;     // Name of the node to be rewritten
     string fwd;      // Name of the node in the forward pass that this node
                      // corresponds to
-    std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
+    size_t max_hop;  // Maximum number of hops the fwd is located
+                     // from this node. If the fwd is farther than max_hop
+                     // then we do not rewrite the node.
   } ContextInfo;
 
   /// Structure to specify the name of an original node, its new name after
@@ -428,7 +438,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
-  typedef struct {
+  struct {
     string avg_pool;
     string avg_pool_grad;
     string bias_add;
@@ -447,15 +457,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string max_pool;
     string max_pool_grad;
     string mkl_conv2d;
-    string mkl_conv2d_grad_input;
-    string mkl_conv2d_grad_filter;
     string mkl_conv2d_with_bias;
     string mkl_conv2d_with_bias_backprop_bias;
     string relu;
     string relu_grad;
     string reshape;
     string split;
-  } ConstStringsInfo;
+  } csinfo_;
 
  private:
   /// Maintain info about nodes to rewrite
@@ -470,9 +478,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to rewrite
   static std::vector<ContextInfo*> cinfo_;
 
-  /// Maintain structure of constant strings
-  static ConstStringsInfo csinfo_;
-
   /// Context variables used in referencing rules
   static ContextInfo biasaddgrad_matmul_context_;
   static ContextInfo biasaddgrad_conv2dwithbias_context_;
@@ -624,173 +629,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
-  // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node
-  // specified in contextinfo 'ci'. Function updates fwd_node to point
-  // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias.
-  //
-  // Association checks for one of the following graphs:
-  //
-  // Graph A:
-  //
-  // _ = Conv2DWithBias(F, I, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = Conv2DBackpropInput(_, I, G)
-  // _ = BiasAddGrad(G)
-  //
-  // OR
-  //
-  // Graph B:
-  //
-  // _ = Conv2DWithBias(F, _, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = BiasAddGrad(G)
-  //
-  // Here F, G, and I are graph nodes; _ represents graph nodes that we
-  // don't care here.
-  //
-  // @return - true (if BiasAddGrad is associated with Conv2DWithBias);
-  //           false otherwise.
-  static bool IsBiasAddGradInConv2DWithBiasContext(const Node* n,
-                                                   const Node** fwd_node,
-                                                   void* ci) {
-    CHECK_NOTNULL(n);
-    CHECK_NOTNULL(fwd_node);
-    CHECK_NOTNULL(ci);
-    *fwd_node = nullptr;
-
-    CHECK_EQ(n->type_string(), csinfo_.bias_add_grad);
-
-    // Get the only 1 input of BiasAddGrad.
-    CHECK_EQ(n->num_inputs(), 1);
-    const Node* bias_add_grad_inp = nullptr;
-    TF_CHECK_OK(n->input_node(0, &bias_add_grad_inp));
-    CHECK_NOTNULL(bias_add_grad_inp);
-
-    // Check if this input also goes to BackpropFilter and BackpropInput
-    // as 3rd input.
-    bool found_backprop_input = false;
-    bool found_backprop_filter = false;
-    Node* backprop_filter_node = nullptr;
-    Node* backprop_input_node = nullptr;
-
-    for (const Edge* e : bias_add_grad_inp->out_edges()) {
-      Node* third_input = nullptr;
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_input ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_input) {
-        // Third input (index 2) of BackpropInput
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropInput must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_input = true;
-          backprop_input_node = e->dst();
-        }
-      }
-
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_filter ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_filter) {
-        // Third input (index 2) of BackpropFilter
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropFilter must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_filter = true;
-          backprop_filter_node = e->dst();
-        }
-      }
-
-      // If we found both the nodes, then we can stop the search.
-      if (found_backprop_input && found_backprop_filter) {
-        break;
-      }
-    }
-
-    // If BackpropFilter node is not found, then this is not
-    // Conv2DWithBias context. For 2nd graph in the example above, only
-    // BackpropFilter would be present.
-    if (!found_backprop_filter) {
-      return false;
-    }
-
-    // Otherwise, we found the nodes.
-    CHECK_NOTNULL(backprop_filter_node);
-    if (found_backprop_input) {
-      CHECK_NOTNULL(backprop_input_node);
-    }
-
-    // Now that we confirmed that this is Conv2DWithBias context, we need to
-    // get access to the forward node (Conv2DWithBias). 2nd input of
-    // Conv2DWithBias is same as the 2nd input of Conv2DBackpropInput; 1st
-    // input of Conv2DWithBias is same as the 1st input of Conv2DBackpropFilter
-    // (This comes from definition of gradient computation for Conv2D).
-    if (found_backprop_input) {
-      // Graph A in the example.
-      Node* second_inp_of_input = nullptr;
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_input_node->input_node(1, &second_inp_of_input));
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(second_inp_of_input);
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from these input nodes.
-      // Conv2DWithBias node is the node that accepts both the nodes
-      // second_inp_of_input and first_inp_of_filter in 2nd and 1st input slots.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          for (const Edge* ie : second_inp_of_input->out_edges()) {
-            if (ie->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-                ie->dst_input() == 1 && fe->dst() == ie->dst()) {
-              VLOG(1) << "MklLayoutRewritePass: found "
-                      << fe->dst()->DebugString()
-                      << " as the forward node for matching context, backward"
-                      << " node is: " << n->DebugString();
-              *fwd_node = fe->dst();
-              return true;
-            }
-          }
-        }
-      }
-    } else {
-      // We did not find BackpropInput, so we work with BackpropFilter only.
-      // Graph B in the example.
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from first input of
-      // BackpropFIlter. Conv2DWithBias node is the node that accepts
-      // first_inp_of_filter in 1st input slot.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found "
-                  << fe->dst()->DebugString()
-                  << " as the forward node for matching context, backward"
-                  << " node is: " << n->DebugString();
-          *fwd_node = fe->dst();
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  // Is BiasAddGrad node in 'n' is associated with MatMul node
-  // specified in contextinfo 'ci'. Function does not update fwd_node.
-  //
-  // @return - true (if BiasAddGrad is associated with MatMul);
-  //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n,
-                                           const Node** fwd_node,
-                                           void* ci) {
-    return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
-  }
-
-
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -801,6 +639,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
 
   // Helper function that searches the matching contextinfo for the node.
+  // Implements breadth-first search in the data dependence graph for the
+  // gradient op in the backward direction.
   //
   // @input n - Node (gradient op) whose contextinfo is to be searched,
   //        fwd_node - pointer to node from the forward pass that this node
@@ -948,7 +788,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                    Node* orig_node);
 };
 
-MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
   MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
@@ -1828,12 +1667,12 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   const ContextInfo* ci = nullptr;
   bool is_context_based_rewrite = false;
   if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
+    CHECK_NOTNULL(fwd_node);
     is_context_based_rewrite = true;
 
     // Sanity checks for context-based rewrite (if any)
     if (orig_node->type_string() == csinfo_.bias_add_grad &&
         ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
-      CHECK_NOTNULL(fwd_node);
       DataType orig_T, ctx_T;
       string orig_data_format, ctx_data_format;
       TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
@@ -1945,17 +1784,69 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   CHECK_NOTNULL(fwd_node);
   *fwd_node = nullptr;
 
-  // Search for matching contextinfo based on node name and call
-  // callback function using matching contextinfo.
-  // There could be more than one matching contextinfos but whichever
-  // matches first is returned.
+  // Search for matching contextinfo based on node name.
+  // There could be more than one matching contextinfos.
+  bool is_matching_cinfo_found = false;
+  std::vector<const ContextInfo*> mci;
   for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == (*ci)->node &&
-        (*ci)->context_match_fn(n, fwd_node, *ci)) {
-      VLOG(1) << "Found context as matching: " << (*ci)->fwd;
-      return *ci;
+    if (n->type_string() == (*ci)->node) {
+      mci.push_back(*ci);
+      is_matching_cinfo_found = true;
     }
   }
+  // If no matching contextinfo is found, return immediately.
+  if (!is_matching_cinfo_found) {
+    return nullptr;
+  }
+
+  VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
+          << " in backwards.";
+
+  // Now we will check for forward op name for context info in data
+  // flow graph. Get the max hops we should search for the fwd node.
+  // We are now going to search (breadth-first) backwards in data
+  // dependence graph (for up to max hops) from n for the node
+  // specified in fwd.
+  // queue to maintain nodes to be visited and depth info for
+  // breadth-first search
+  std::queue<std::pair<const Node*, int>> nqueue;
+  const Node* curr_node = n;
+  size_t curr_depth = 0;
+  nqueue.push(std::make_pair(curr_node, curr_depth));
+
+  while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
+    std::pair<const Node*, int> curr_pair = nqueue.front();
+    nqueue.pop();
+
+    std::set<const Node*> visited_nodes;
+    curr_node = curr_pair.first;
+    curr_depth = curr_pair.second;
+    CHECK_NOTNULL(curr_node);
+
+    VLOG(1) << "MklLayoutRewritePass: Visiting node: "
+            << curr_node->type_string() << " at depth: " << curr_depth
+            << " for node: " << n->type_string();
+
+    // If we find a match, we return immediately.
+    for (const ContextInfo* ci : mci) {
+      if (curr_node->type_string() == ci->fwd) {
+        *fwd_node = curr_node;
+        return ci;
+      }
+    }
+
+    // Else we explore backward edges from current node.
+    // Add the source nodes of all incoming edges of the node to the queue.
+    for (const Edge* e : curr_node->in_edges()) {
+      // We do not visit already visited node.
+      if (visited_nodes.find(e->src()) == visited_nodes.end()) {
+        // Depth of these nodes is 1 more than the depth of current node.
+        nqueue.push(std::make_pair(e->src(), curr_depth + 1));
+        visited_nodes.insert(e->src());
+      }
+    }
+  } /* while */
+
   return nullptr;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index efbe2134e0..3c4a5263af 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -345,8 +345,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
 // Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
 // rewrite tests
 
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput
+// D=_MklConv2D(A,M,B,N,C,O); E=Sub(D,A); F=BiasAddGrad(E)
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -365,255 +364,16 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
       "node { name: 'E' op: 'Sub'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
-            "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
-            "E:control->DMT/_0:control;F->G:1;H->I;M->D:3;M->G:3;M->I:3;"
-            "N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
+      "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " input: ['E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);"
-            "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
-            "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
-            "N->D:4;N->G:4;O->D:5;O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
+            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);"
+            "N(_MklInput);O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;"
+            "DMT/_0->F:1;E->F;E:control->DMT/_0:control;M->D:3;N->D:4;"
+            "O->D:5");
 }
 
 // No _MklConv2DWithBias in context, but _MklConv2D in context.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index f6595fcbb3..e959eab54e 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -728,7 +728,7 @@ TEST_F(VirtualSchedulerTest, ComplexDependency) {
            1 /* control dependency */);
   EXPECT_EQ(expected_size, cpu_state.memory_usage);
 
-  // Nodes currently in memory: bn's port -1, 0, and 2, and x's port 0.
+  // Nodes currently in memory: bn's port -1, 0, and 2, and x's port 0.
   std::set<std::pair<string, int>> nodes_in_memory;
   std::transform(
       cpu_state.nodes_in_memory.begin(), cpu_state.nodes_in_memory.end(),
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 1e7a9dfaf5..84a7681782 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -23,7 +23,6 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/protobuf/queue_runner.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.cc b/tensorflow/core/grappler/optimizers/auto_parallel.cc
index d46b849ad4..42f2f1850f 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.cc
@@ -168,11 +168,6 @@ Status AutoParallel::Initialize(const GrapplerItem& item) {
   for (const auto& variable : item.MainVariables()) {
     dont_replicate_nodes.insert(variable->name());
   }
-
-  for (const auto& init : item.init_ops) {
-    dont_replicate_nodes.insert(NodeName(init));
-  }
-
   // Don't replicate all input nodes, except the dequeue node.
   for (const auto& input_node : input_nodes) {
     if (input_node->name() != dequeue_node->name()) {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index c5d2d47782..ad90bbe028 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
-#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
index 9a41b5e0b5..3d1b4a34bf 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
@@ -33,7 +33,6 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   Output constant_b = ops::Const(s.WithOpName("constant_b"), 1, {1});
   Output var = ops::Variable(s.WithOpName("var"), {1}, DT_FLOAT);
   Output assign = ops::Assign(s.WithOpName("assign"), {var}, {constant_a});
-  Output identity = ops::Identity(s.WithOpName("identity"), {var});
   Output fifo_queue = ops::FIFOQueue(s.WithOpName("fifo_queue"), {DT_FLOAT});
   auto dequeue = ops::QueueDequeueMany(s.WithOpName("dequeue"), {fifo_queue},
                                        {constant_b}, {DT_FLOAT});
@@ -45,14 +44,13 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   GrapplerItem item;
   item.init_ops.push_back("assign");
   item.fetch.push_back("apply_gradient");
-  item.init_ops.push_back("assign");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   AutoParallel parallel(2);
   GraphDef output;
   Status status = parallel.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  EXPECT_EQ(21, output.node_size());
+  EXPECT_EQ(20, output.node_size());
 
   const NodeDef& node_assign = output.node(0);
   EXPECT_EQ("assign", node_assign.name());
@@ -64,64 +62,60 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   const NodeDef& node_fifo_queue = output.node(2);
   EXPECT_EQ("fifo_queue", node_fifo_queue.name());
 
-  const NodeDef& node_identity = output.node(3);
-  EXPECT_EQ("identity", node_identity.name());
-  EXPECT_EQ("var", node_identity.input(0));
-
-  const NodeDef& node_var = output.node(4);
+  const NodeDef& node_var = output.node(3);
   EXPECT_EQ("var", node_var.name());
 
-  const NodeDef& node_div_const0 = output.node(5);
+  const NodeDef& node_div_const0 = output.node(4);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-Const",
             node_div_const0.name());
 
-  const NodeDef& node_div0 = output.node(6);
+  const NodeDef& node_div0 = output.node(5);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-apply_gradient",
             node_div0.name());
-  const NodeDef& node_add0 = output.node(7);
+  const NodeDef& node_add0 = output.node(6);
   EXPECT_EQ("AutoParallel-Replica-0/add", node_add0.name());
 
-  const NodeDef& node_gradient0 = output.node(8);
+  const NodeDef& node_gradient0 = output.node(7);
   EXPECT_EQ("AutoParallel-Replica-0/apply_gradient", node_gradient0.name());
 
-  const NodeDef& node_constant_a0 = output.node(9);
+  const NodeDef& node_constant_a0 = output.node(8);
   EXPECT_EQ("AutoParallel-Replica-0/constant_a", node_constant_a0.name());
 
-  const NodeDef& node_dequeue0 = output.node(10);
+  const NodeDef& node_dequeue0 = output.node(9);
   EXPECT_EQ("AutoParallel-Replica-0/dequeue", node_dequeue0.name());
 
-  const NodeDef& node_learning_rate0 = output.node(11);
+  const NodeDef& node_learning_rate0 = output.node(10);
   EXPECT_EQ("AutoParallel-Replica-0/learning_rate", node_learning_rate0.name());
 
-  const NodeDef& node_div_const1 = output.node(12);
+  const NodeDef& node_div_const1 = output.node(11);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-Const",
             node_div_const1.name());
 
-  const NodeDef& node_div1 = output.node(13);
+  const NodeDef& node_div1 = output.node(12);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-apply_gradient",
             node_div1.name());
 
-  const NodeDef& node_add1 = output.node(14);
+  const NodeDef& node_add1 = output.node(13);
   EXPECT_EQ("AutoParallel-Replica-1/add", node_add1.name());
 
-  const NodeDef& node_gradient1 = output.node(15);
+  const NodeDef& node_gradient1 = output.node(14);
   EXPECT_EQ("AutoParallel-Replica-1/apply_gradient", node_gradient1.name());
 
-  const NodeDef& node_constant_a1 = output.node(16);
+  const NodeDef& node_constant_a1 = output.node(15);
   EXPECT_EQ("AutoParallel-Replica-1/constant_a", node_constant_a1.name());
 
-  const NodeDef& node_dequeue1 = output.node(17);
+  const NodeDef& node_dequeue1 = output.node(16);
   EXPECT_EQ("AutoParallel-Replica-1/dequeue", node_dequeue1.name());
 
-  const NodeDef& node_learning_rate1 = output.node(18);
+  const NodeDef& node_learning_rate1 = output.node(17);
   EXPECT_EQ("AutoParallel-Replica-1/learning_rate", node_learning_rate1.name());
 
-  const NodeDef& node_fetch = output.node(19);
+  const NodeDef& node_fetch = output.node(18);
   EXPECT_EQ("AutoParallel-Control-Fetch", node_fetch.name());
   EXPECT_EQ("^AutoParallel-Replica-0/apply_gradient", node_fetch.input(0));
   EXPECT_EQ("^AutoParallel-Replica-1/apply_gradient", node_fetch.input(1));
 
-  const NodeDef& node_gradient = output.node(20);
+  const NodeDef& node_gradient = output.node(19);
   EXPECT_EQ("apply_gradient", node_gradient.name());
   EXPECT_EQ("^AutoParallel-Control-Fetch", node_gradient.input(0));
 }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 28d663e2f7..ded1e474ce 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -929,7 +929,7 @@ struct TuningConfig {
   // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
   // usually faster than the NCHW implementation. The downside is that this
   // might result in more non-cancellable layout conversion nodes (implemented
-  // by the Transpose op).
+  // by the Transpose op).
   bool no_gemm;
 };
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9c397954e1..8b7c269a11 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2464,7 +2464,7 @@ tf_cc_tests(
         ":ops_util",
         ":sparse_add_op",
         ":sparse_dense_binary_op_shared",
-        ":sparse_reduce_op",
+        ":sparse_reduce_sum_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3207,7 +3207,7 @@ cc_library(
         ":sparse_cross_op",
         ":sparse_dense_binary_op_shared",
         ":sparse_fill_empty_rows_op",
-        ":sparse_reduce_op",
+        ":sparse_reduce_sum_op",
         ":sparse_reorder_op",
         ":sparse_reshape_op",
         ":sparse_softmax",
@@ -3263,8 +3263,8 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "sparse_reduce_op",
-    prefix = "sparse_reduce_op",
+    name = "sparse_reduce_sum_op",
+    prefix = "sparse_reduce_sum_op",
     deps = SPARSE_DEPS,
 )
 
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 37976f7183..c8f12f91a6 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -31,9 +31,6 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
-#ifdef TENSORFLOW_USE_SYCL
-typedef Eigen::SyclDevice SYCLDevice;
-#endif
 
 // AdjustContrastOp is deprecated as of GraphDef version >= 2
 
@@ -413,25 +410,4 @@ REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
                         AdjustContrastOpv2<GPUDevice>);
 #endif  // GOOGLE_CUDA
 
-#ifdef TENSORFLOW_USE_SYCL
-template <>
-class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
- public:
-  explicit AdjustContrastOpv2(OpKernelConstruction* context)
-      : AdjustContrastOpV2Base(context) {}
-
-  void DoCompute(OpKernelContext* context,
-                 const ComputeOptions& options) override {
-    const int64 shape[4] = {options.batch, options.height, options.width,
-                            options.channels};
-    functor::AdjustContrastv2<SYCLDevice>()(
-        context->eigen_device<SYCLDevice>(),
-        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
-        options.output->shaped<float, 4>(shape));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_SYCL),
-                        AdjustContrastOpv2<SYCLDevice>);
-#endif  // TENSORFLOW_USE_SYCL
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
index c485f14844..ffd47406eb 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -56,11 +56,6 @@ static Graph* BM_AdjustContrast(int batches, int width, int height) {
 // BM_AdjustContrast_cpu_1_299_299     179084     340186  2181  751.9M items/s
 // BM_AdjustContrast_gpu_32_299_299     85276     123665  4189  2.9G items/s
 BM_AdjustContrastDev(cpu, 1, 299, 299);
-#if GOOGLE_CUDA
 BM_AdjustContrastDev(gpu, 32, 299, 299);
-#endif  // GOOGLE_CUDA
-#ifdef TENSORFLOW_USE_SYCL
-BM_AdjustContrastDev(sycl, 32, 299, 299);
-#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index ba100b32e7..d65a34fd73 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -35,9 +35,6 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
-#ifdef TENSORFLOW_USE_SYCL
-typedef Eigen::SyclDevice SYCLDevice;
-#endif
 
 template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
@@ -149,16 +146,4 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          RGBToHSVOp<SYCLDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          HSVToRGBOp<SYCLDevice, T>);
-TF_CALL_float(REGISTER_SYCL);
-TF_CALL_double(REGISTER_SYCL);
-#endif
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 64c06786bc..203a9a9f24 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -112,14 +112,15 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_SWITCH(type)                       \
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                       \
   REGISTER_KERNEL_BUILDER(Name("Switch")                 \
                               .Device(DEVICE_SYCL)       \
-                              .HostMemory("pred")        \
-                              .TypeConstraint<type>("T"),\
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("pred"),       \
                           SwitchOp)
-TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
+REGISTER_SYCL_KERNEL(bool);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
 #define REGISTER_SYCL_REF_SWITCH(type)                     \
   REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
@@ -127,41 +128,12 @@ TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
                               .HostMemory("pred")          \
                               .TypeConstraint<type>("T"),  \
                           SwitchOp)
-TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
+REGISTER_SYCL_REF_SWITCH(bool);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
-#undef REGISTER_SYCL_SWITCH
+#undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_SWITCH
 
-#define REGISTER_SYCL_HOST_KERNEL(type)                  \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("data")        \
-                              .HostMemory("pred")        \
-                              .HostMemory("output_false")\
-                              .HostMemory("output_true") \
-                              .TypeConstraint<type>("T"),\
-                          SwitchOp)
-
-REGISTER_SYCL_HOST_KERNEL(bool);
-REGISTER_SYCL_HOST_KERNEL(string);
-REGISTER_SYCL_HOST_KERNEL(int32);
-
-#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("data")          \
-                              .HostMemory("pred")          \
-                              .HostMemory("output_false")  \
-                              .HostMemory("output_true")   \
-                              .TypeConstraint<type>("T"),  \
-                          SwitchOp)
-
-REGISTER_SYCL_HOST_REF_KERNEL(int32);
-REGISTER_SYCL_HOST_REF_KERNEL(bool);
-REGISTER_SYCL_HOST_REF_KERNEL(string);
-
-#undef REGISTER_SYCL_HOST_KERNEL
-#undef REGISTER_SYCL_HOST_REF_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
@@ -261,13 +233,13 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
+#if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                        \
   REGISTER_KERNEL_BUILDER(Name("Merge")                   \
                               .Device(DEVICE_SYCL)        \
                               .TypeConstraint<type>("T")  \
                               .HostMemory("value_index"), \
-                          MergeOp);
+                          MergeOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
@@ -276,10 +248,9 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
                               .Device(DEVICE_SYCL)        \
                               .TypeConstraint<type>("T")  \
                               .HostMemory("value_index"), \
-                          MergeOp);
+                          MergeOp)
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
-
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
 #endif // TENSORFLOW_USE_SYCL
@@ -309,30 +280,6 @@ REGISTER_GPU_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_HOST_KERNEL(type)                   \
-  REGISTER_KERNEL_BUILDER(Name("Merge")                   \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("inputs")       \
-                              .HostMemory("output")       \
-                              .HostMemory("value_index")  \
-                              .TypeConstraint<type>("T"), \
-                          MergeOp);                       \
-  REGISTER_KERNEL_BUILDER(Name("RefMerge")                \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("inputs")       \
-                              .HostMemory("output")       \
-                              .HostMemory("value_index")  \
-                              .TypeConstraint<type>("T"), \
-                          MergeOp)
-
-REGISTER_SYCL_HOST_KERNEL(int32);
-REGISTER_SYCL_HOST_KERNEL(string);
-REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
-
-#undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
     context->forward_ref_input_to_ref_output(0, 0);
@@ -359,7 +306,7 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
+#if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)  \
   REGISTER_KERNEL_BUILDER(          \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
@@ -398,7 +345,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -447,25 +394,30 @@ REGISTER_KERNEL_BUILDER(Name("RefExit").Device(DEVICE_CPU), ExitOp);
       Name("RefExit").Device(DEVICE_GPU).TypeConstraint<type>("T"), ExitOp);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
-REGISTER_GPU_REF_KERNEL(bool);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)  \
+  REGISTER_KERNEL_BUILDER(          \
+  Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
+#define REGISTER_SYCL_REF_KERNEL(type)  \
+  REGISTER_KERNEL_BUILDER(              \
+  Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp)
+REGISTER_SYCL_REF_KERNEL(bool);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
+
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
 
+// Special SYCL kernels for int32 and string.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 #define REGISTER_SYCL_HOST_KERNEL(type)                   \
   REGISTER_KERNEL_BUILDER(Name("Exit")                    \
                               .Device(DEVICE_SYCL)        \
@@ -555,19 +507,31 @@ REGISTER_GPU_HOST_KERNEL(string);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      NextIterationOp);                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
-      NextIterationOp)
-REGISTER_SYCL_KERNEL(bool);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)  \
+  REGISTER_KERNEL_BUILDER(Name("NextIteration")           \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          NextIterationOp)
+  REGISTER_SYCL_KERNEL(bool);
+  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+#define REGISTER_SYCL_REF_KERNEL(type)                    \
+  REGISTER_KERNEL_BUILDER(Name("RefNextIteration")        \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          NextIterationOp)
+  REGISTER_SYCL_REF_KERNEL(bool);
+  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 #undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_REF_KERNEL
 
+// Special SYCL kernels for int32 and string.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 #define REGISTER_SYCL_HOST_KERNEL(type)                   \
   REGISTER_KERNEL_BUILDER(Name("NextIteration")           \
                               .Device(DEVICE_SYCL)        \
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index 5dea00e95c..5d3385b0ed 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,11 +22,10 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          uint8, complex128, string);
+REGISTER5(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
+          complex128, string);
 #if GOOGLE_CUDA
-REGISTER4(BinaryOp, GPU, "Add", functor::add, uint8, int64, complex64,
-          complex128);
+REGISTER3(BinaryOp, GPU, "Add", functor::add, int64, complex64, complex128);
 #endif  // GOOGLE_CUDA
 
 #endif  // !defined(__ANDROID_TYPES_SLIM__)
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
deleted file mode 100644
index bca99a4f89..0000000000
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/cwise_ops_common.h"
-
-namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
-
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cosh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
-#if GOOGLE_CUDA
-REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
-#endif
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
index 61079ebab3..5aaf2b5b4b 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
@@ -19,8 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY7(add, Eigen::half, float, double, uint8, int64, complex64,
-               complex128);
+DEFINE_BINARY6(add, Eigen::half, float, double, int64, complex64, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
deleted file mode 100644
index 267a381d1a..0000000000
--- a/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
-
-namespace tensorflow {
-namespace functor {
-DEFINE_UNARY2(cosh, float, double);
-}  // namespace functor
-}  // namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
deleted file mode 100644
index f8329e50d6..0000000000
--- a/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
-
-namespace tensorflow {
-namespace functor {
-DEFINE_UNARY2(sinh, float, double);
-}  // namespace functor
-}  // namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index df2c02e42e..c84ee6894e 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -20,7 +20,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
           uint8, uint16);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
+REGISTER(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
          uint8, uint16);
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
deleted file mode 100644
index 055f0b12e1..0000000000
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/cwise_ops_common.h"
-
-namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-          complex64, complex128);
-
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sinh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
-
-#if GOOGLE_CUDA
-REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
-#endif
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index c11d6cfabb..97bdc5e878 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -483,12 +483,6 @@ struct log1p : base<T, Eigen::internal::scalar_log1p_op<T> > {};
 template <typename T>
 struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {};
 
-template <typename T>
-struct sinh : base<T, Eigen::internal::scalar_sinh_op<T> > {};
-
-template <typename T>
-struct cosh : base<T, Eigen::internal::scalar_cosh_op<T> > {};
-
 template <typename T>
 struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {};
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index 135d635514..08ae787c86 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -165,32 +165,33 @@ class DynamicStitchOp : public OpKernel {
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
-#if GOOGLE_CUDA
-#define REGISTER_DYNAMIC_STITCH_GPU(type)                \
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_DYNAMIC_STITCH_SYCL(type)               \
   REGISTER_KERNEL_BUILDER(Name("DynamicStitch")          \
-                              .Device(DEVICE_GPU)        \
+                              .Device(DEVICE_SYCL)       \
                               .TypeConstraint<type>("T") \
                               .HostMemory("indices")     \
                               .HostMemory("data")        \
                               .HostMemory("merged"),     \
                           DynamicStitchOp<type>)
 
-TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_GPU);
-#undef REGISTER_DYNAMIC_STITCH_GPU
-
-#endif  // GOOGLE_CUDA
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_SYCL);
+#undef REGISTER_DYNAMIC_STITCH_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_DYNAMIC_STITCH_SYCL(type)               \
+#if GOOGLE_CUDA
+#define REGISTER_DYNAMIC_STITCH_GPU(type)                \
   REGISTER_KERNEL_BUILDER(Name("DynamicStitch")          \
-                              .Device(DEVICE_SYCL)       \
+                              .Device(DEVICE_GPU)        \
                               .TypeConstraint<type>("T") \
                               .HostMemory("indices")     \
                               .HostMemory("data")        \
                               .HostMemory("merged"),     \
                           DynamicStitchOp<type>)
 
-TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_SYCL);
-#undef REGISTER_DYNAMIC_STITCH_SYCL
-#endif  // TENSORFLOW_USE_SYCL
+TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_GPU);
+#undef REGISTER_DYNAMIC_STITCH_GPU
+
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 46eaf3d9e7..6431c6540e 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -547,14 +547,14 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
                             .HostMemory("indices")
                             .Device(DEVICE_GPU),
                         MapStageOp<true>);
-#endif // GOOGLE_CUDA
-
+#endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapStage").HostMemory("key").Device(DEVICE_SYCL),
                         MapStageOp<false>);
 REGISTER_KERNEL_BUILDER(
     Name("OrderedMapStage").HostMemory("key").Device(DEVICE_SYCL),
     MapStageOp<true>);
+
 #endif // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
@@ -661,7 +661,6 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek")
                             .Device(DEVICE_GPU),
                         MapPeekOp<true>);
 #endif
-
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("MapPeek").HostMemory("key").HostMemory("indices").Device(DEVICE_SYCL),
@@ -725,8 +724,8 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey")
                             .HostMemory("indices")
                             .Device(DEVICE_GPU),
                         MapUnstageNoKeyOp<true>);
-#endif
 
+#endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapUnstageNoKey")
                             .HostMemory("key")
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index ddcf241277..dc6b88e953 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -206,15 +206,10 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     // Mkl needs the entities in its native format.
     // So create temporary tensors along with buffers to
     // convert the received entities.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor,
-           mkl_tmp_buf_trans_input;
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
     // This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
-    mkl_context.MklPrepareInputs(context, data_format_,
-                                 input_in_mkl_format,
-                                 out_backprop_in_mkl_format,
-                                 &mkl_tmp_input_buf_tensor,
-                                 &mkl_tmp_out_backprop_buf_tensor,
-                                 &mkl_tmp_buf_trans_input);
+    mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
+                                 &mkl_tmp_out_backprop_buf_tensor);
 
     // Final conv-grad-filter should be in TF layout.
     Tensor* grad_filter;
@@ -312,58 +307,34 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
 
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
-    void MklPrepareInputs(OpKernelContext* context, TensorFormat format,
-                          bool input_in_mkl_format,
-                          bool out_backprop_in_mkl_format,
+    void MklPrepareInputs(OpKernelContext* context,
                           Tensor* mkl_tmp_input_buf_tensor,
-                          Tensor* mkl_tmp_out_backprop_buf_tensor,
-                          Tensor* mkl_tmp_buf_trans_input) {
+                          Tensor* mkl_tmp_out_backprop_buf_tensor) {
       bool mkl_convert_input, mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop,
-                  mkl_lt_trans_input;
+      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
       void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
-      void *mkl_buf_input, *mkl_buf_out_backprop;
 
       mkl_prim_convert_input = nullptr;
       mkl_prim_convert_out_backprop = nullptr;
       mkl_lt_internal_input = nullptr;
       mkl_lt_internal_out_backprop = nullptr;
-      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_out_backprop = nullptr;
-      mkl_buf_input = nullptr;
-      mkl_buf_out_backprop = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      if (!input_in_mkl_format && format == FORMAT_NHWC){
-        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
-            in_sizes[MklDims::N], in_sizes[MklDims::H],
-            in_sizes[MklDims::W], in_sizes[MklDims::C]);
-        OP_REQUIRES_OK(context, context->allocate_temp(
-            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
-        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
-        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
-            mkl_tmp_buf_trans_input->flat<float>().data()));
-        size_t strides[4];
-        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
-        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
-            strides), E_SUCCESS);
-      }
-      else {
-        mkl_buf_input =
-            const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-        mkl_lt_trans_input = lt_input;
-      }
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
                    &mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                     mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -372,30 +343,26 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
-      if (!input_in_mkl_format && format == FORMAT_NHWC)
-        dnnLayoutDelete_F32(mkl_lt_trans_input);
-
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      mkl_buf_out_backprop = const_cast<void*>(
-                      static_cast<const void*>(out_backprop.flat<T>().data()));
-
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
                                                 dnnResourceDiffDst),
                E_SUCCESS);
       mkl_convert_out_backprop =
-          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop,
-                                lt_out_backprop);
+          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                      lt_out_backprop, mkl_lt_internal_out_backprop),
+                                         lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            mkl_lt_internal_out_backprop, &mkl_buf_convert_out_backprop);
+                       lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index df49e03f31..76b9f1798d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -267,15 +267,12 @@ class MklConv2DOp : public OpKernel {
 
     mkl_context.MklCreateInputLayouts(context);
 
-    // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
-        mkl_tmp_bias_buf_tensor, mkl_tmp_buf_trans_input;
-    mkl_context.MklPrepareConvolutionInputs(context, data_format_,
-                                            input_in_mkl_format,
-                                            &mkl_tmp_input_buf_tensor,
+        mkl_tmp_bias_buf_tensor;  // Temp tensor used to allocate tmp
+                                  // buffers
+    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
-                                            &mkl_tmp_bias_buf_tensor,
-                                            &mkl_tmp_buf_trans_input);
+                                            &mkl_tmp_bias_buf_tensor);
 
     // Execute convolution
     CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res),
@@ -326,59 +323,39 @@ class MklConv2DOp : public OpKernel {
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
     void MklPrepareConvolutionInputs(OpKernelContext* context,
-                                     TensorFormat format,
-                                     bool input_in_mkl_format,
                                      Tensor* mkl_tmp_input_buf_tensor,
                                      Tensor* mkl_tmp_filter_buf_tensor,
-                                     Tensor* mkl_tmp_bias_buf_tensor,
-                                     Tensor* mkl_tmp_buf_trans_input) {
+                                     Tensor* mkl_tmp_bias_buf_tensor) {
       bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
       dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
           mkl_prim_convert_input;
       dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
-          mkl_lt_internal_input, mkl_lt_trans_input;
+          mkl_lt_internal_input;
       void *mkl_buf_convert_input, *mkl_buf_convert_filter,
-          *mkl_buf_convert_bias, *mkl_buf_input;
+          *mkl_buf_convert_bias;
       mkl_prim_convert_filter = nullptr;
       mkl_prim_convert_bias = nullptr;
       mkl_prim_convert_input = nullptr;
       mkl_lt_internal_filter = nullptr;
       mkl_lt_internal_bias = nullptr;
       mkl_lt_internal_input = nullptr;
-      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_filter = nullptr;
       mkl_buf_convert_bias = nullptr;
-      mkl_buf_input = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      if (!input_in_mkl_format && format == FORMAT_NHWC) {
-        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
-            in_sizes[MklDims::N], in_sizes[MklDims::H],
-            in_sizes[MklDims::W], in_sizes[MklDims::C]);
-        OP_REQUIRES_OK(context, context->allocate_temp(
-            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
-        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
-        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
-            mkl_tmp_buf_trans_input->flat<float>().data()));
-        size_t strides[4];
-        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
-        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
-            strides), E_SUCCESS);
-      } else {
-          mkl_buf_input = const_cast<void*>(
-                              static_cast<const void*>(input.flat<T>().data()));
-          mkl_lt_trans_input = lt_input;
-      }
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
                                                 prim_fwd, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -387,8 +364,6 @@ class MklConv2DOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
-      if (!input_in_mkl_format && format == FORMAT_NHWC)
-        dnnLayoutDelete_F32(mkl_lt_trans_input);
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 07a7e6b5da..070aeff49f 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,9 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
@@ -66,10 +66,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -92,9 +93,10 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -102,6 +104,15 @@ class MklLRNOp : public OpKernel {
       return;
     }
 
+    // TODO(inteltf) MKL will support depth radius not equal to 2 in the future
+    if (depth_radius_ != 2) {
+      Tensor converted_tensor =
+          ConvertMklToTF<T>(context, input, mkl_context.input_shape);
+      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                    beta_, converted_tensor);
+      return;
+    }
+
     if (input_in_mkl_format) {
       // MKL supports normalization over channel dimension only
       if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
@@ -334,10 +345,11 @@ class MklLRNGradOp : public OpKernel {
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -541,9 +553,6 @@ class MklLRNGradOp : public OpKernel {
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
                                                 dnnResourceDiffDst),
                E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_bwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
 
       bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
       if (ingrad_in_mkl_format) {
@@ -572,37 +581,44 @@ class MklLRNGradOp : public OpKernel {
         }
       }
 
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (inimage_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          ingrad_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
+// Although MKL documentation for LRN does not specify setting/getting
+// of dnnResourceSrc and dnnResourceDst, Caffe code sets dnnResourceSrc.
+// So we set dnnResourceSrc here. But we do not know why we are setting
+// dnnResourceDst.
+#if 0
+    // NOTE: The code below is kept just so that we know how we should handle
+    // dnnResourceSrc if the primitive layout for dnnResourceSrc was supported.
+
+    if (!dnnLayoutCompare_F32(lt_internal_input,
+         static_cast<dnnLayout_t>inimage_shape.GetCurLayout())) {
+      AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+                     &res_lrn_bwd[dnnResourceSrc]);
+      inimage_shape.GetConvertedFlatData(lt_internal_input,
+                                           user_fwd_input,
+                                           res_lrn_bwd[dnnResourceSrc]);
+    } else {
+      res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+    }
+#endif
+
+      // Since we cannot get expected layout for dnnResourceSrc, we construct
+      // buffer using
+      // MKL format if input is in MKL format.
+      if (inimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_image_buf_tensor,
+                       (dnnLayout_t)inimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceSrc]);
       } else {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          CHECK_EQ(dnnConversionCreate_F32(
-                       &convert_input,
-                       static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()),
-                       lt_internal_input),
-                   E_SUCCESS);
+        res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+      }
 
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
+      // Same comment as above.
+      if (outimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_outimage_buf_tensor,
+                       (dnnLayout_t)outimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceDst]);
+      } else {
+        res_lrn_bwd[dnnResourceDst] = user_fwd_output;
       }
 
       res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
@@ -612,6 +628,8 @@ class MklLRNGradOp : public OpKernel {
     // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
     // copy.
     void MklDefaultToEigen(OpKernelContext* context) {
+      // CHECK(false);
+
       Tensor in_grads;
       Tensor in_image;
       Tensor out_image;
@@ -691,7 +709,7 @@ class MklLRNGradOp : public OpKernel {
       Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
             depth * depth, shard);
     }
-
+		
     // release mkl resources
     void Mklcleanup() {
       bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index fabecc39a8..10d2937584 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -184,31 +184,38 @@ class MklReluGradOp : public OpKernel {
     dnnLayout_t lt_input, lt_grad;
 
     void MklPrepareReluGradInputs(OpKernelContext* context,
+                                  Tensor* mkl_tmp_grad_buf_tensor,
                                   Tensor* mkl_tmp_input_buf_tensor) {
+      dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
+      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
+
       const Tensor& g = MklGetInput(context, 0);
       const Tensor& a = MklGetInput(context, 1);
-      void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+
+      void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+      void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+      dnnPrimitive_t cv_input_to_grad = NULL;
+      Tensor mkl_tmp_buf_tensor;
       void* mkl_buffer_convert = nullptr;
-      dnnPrimitive_t cv_input_to_grad = nullptr;
 
       // if input and grad are not in the same layout, do a conversion between
       // them.
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
+        AllocTmpBuffer(context, &mkl_tmp_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
-                   lt_grad), E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
+
+        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, user_i,
                                           mkl_buffer_convert),
                  E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
-        relu_res[dnnResourceSrc] = buf_input;
+        relu_res[dnnResourceSrc] = user_i;
       }
 
-      void* buf_grad = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-      relu_res[dnnResourceDiffDst] = buf_grad;
+      relu_res[dnnResourceDiffDst] = user_g;
     }
 
     void MklCreateInputLayouts(OpKernelContext* context) {
@@ -310,8 +317,9 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                                      mkl_context.lt_grad, mkl_context.lt_grad,
                                      negative_slope),
            E_SUCCESS);
-  Tensor mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
+  Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
+  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
+                                       &mkl_tmp_input_buf_tensor);
 
   if (input_is_mkl ||
       grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index b4aae67ca6..588d6874dd 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -24,13 +24,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -45,11 +44,10 @@ class MklToTfOp : public OpKernel {
   explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
     OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
-    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
   }
 
   void Compute(OpKernelContext* context) override {
-    // Check that input tensor is in MKL format.
+    // 1. Check that input tensor is in MKL format.
     const Tensor& input_tensor = MklGetInput(context, 0);
     MklShape input_shape;
     GetMklShape(context, 0, &input_shape);
@@ -70,12 +68,9 @@ class MklToTfOp : public OpKernel {
     CHECK_EQ(op_data_type, output_data_type);
 
     TensorShape output_shape;
-    size_t ndims = input_shape.GetDimension();
-    size_t* in_sizes = new size_t[ndims];
-    for (size_t i = 0; i < ndims; i++) {
+    for (size_t i = 0; i < input_shape.GetDimension(); i++) {
       // Outermost to innermost dimension
       output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
-      in_sizes[i] = input_shape.GetSizes()[i];
     }
 
     // Allocate output tensor.
@@ -83,41 +78,17 @@ class MklToTfOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_shape, &output_tensor));
 
-    // If data format is NHWC, transform MKL tensor to NCHW format and then
-    // do NCHW -> NHWC.
-    dnnLayout_t lt_trans_input = nullptr;
-    Tensor mkl_tmp_trans_input_buf_tensor;
-    void* buf_trans_input = nullptr;
-    bool input_fmt_nhwc = input_shape.IsTensorInNHWCFormat();
-    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
-      size_t strides_nchw[4];
-      GetStridesFromSizes(FORMAT_NCHW, strides_nchw, in_sizes);
-      CHECK_EQ(
-          dnnLayoutCreate_F32(&lt_trans_input, ndims, in_sizes, strides_nchw),
-          E_SUCCESS);
-      AllocTmpBuffer(context, &mkl_tmp_trans_input_buf_tensor, lt_trans_input,
-                     &buf_trans_input);
-    } else {
-      lt_trans_input = static_cast<dnnLayout_t>(input_shape.GetTfLayout());
-      buf_trans_input =
-          static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    }
+    // 3. Get input and output layout pointers.
+    dnnLayout_t output_layout =
+        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
 
-    // Execute DNNConversion.
+    // 4. Execute DNNConversion.
     void* input_buffer =
         static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    input_shape.GetConvertedFlatData(lt_trans_input, input_buffer,
-                                     buf_trans_input);
-    // NCHW -> NHWC, if data format is NHWC
-    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
-      dnnLayoutDelete_F32(lt_trans_input);
-      TensorShape nhwc_shape = ShapeFromFormat(
-          FORMAT_NHWC, in_sizes[MklDims::N], in_sizes[MklDims::H],
-          in_sizes[MklDims::W], in_sizes[MklDims::C]);
-      MklNCHWToNHWC(mkl_tmp_trans_input_buf_tensor, &output_tensor);
-    }
-
-    delete[] in_sizes;
+    void* output_buffer =
+        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
+    input_shape.GetConvertedFlatData(output_layout, input_buffer,
+                                     output_buffer);
 
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
@@ -128,9 +99,6 @@ class MklToTfOp : public OpKernel {
 
   /// Data type of the operation
   DataType op_data_type;
-
-  /// CPUIDInfo
-  bool has_avx512f_ = false;
 };
 
 ///////////////////////////////////////////////////////////
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index dc95f67ff0..9ffe71e031 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -90,24 +90,20 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   return intersection_area / (area_i + area_j - intersection_area);
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context,
-                           const Tensor& boxes,
-                           const Tensor& scores,
-                           const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
-  
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
   if (!context->status().ok()) {
     return;
   }
 
-  const int output_size =
-      std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data =
-      boxes.tensor<float, 2>();
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -127,7 +123,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
     for (int j = i + 1; j < num_boxes; ++j) {
       if (active[j]) {
         float iou =
-          ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
+            ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
         if (iou > iou_threshold) {
           active[j] = false;
           num_active--;
@@ -145,7 +141,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
   std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
 }
 
-} // namespace
+}  // namespace
 
 template <typename Device>
 class NonMaxSuppressionOp : public OpKernel {
@@ -167,7 +163,8 @@ class NonMaxSuppressionOp : public OpKernel {
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                 max_output_size.shape().DebugString()));
 
-    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_);
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
+                          iou_threshold_);
   }
 
  private:
@@ -178,8 +175,7 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -194,14 +190,14 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                iou_threshold.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
-    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_val);
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
+                          iou_threshold_val);
   }
 };
 
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 4c406fc1ed..894ad3c9a0 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -339,7 +339,7 @@ void PriorityQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
             for (; s > 0; --s) {
               if (attempt->tuple.empty()) {
                 // Only allocate tuple when we have something to dequeue
-                // so we don't use excessive memory when there are many
+                // so we don't use excessive memory when there are many
                 // blocked dequeue attempts waiting.
                 attempt->tuple.reserve(num_components());
                 for (int i = 0; i < num_components(); ++i) {
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index c5e3164145..d78c6d2639 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -48,7 +48,6 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                           ShapeOp<int64>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("Shape")
@@ -103,7 +102,7 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                             .TypeConstraint<int32>("T")
                             .TypeConstraint<int64>("out_type"),
                         ShapeOp<int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // ShapeN ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ShapeN")
@@ -153,9 +152,9 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                             .TypeConstraint<int32>("T")
                             .TypeConstraint<int64>("out_type"),
                         ShapeNOp<int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
-#ifdef TENSORFLOW_USE_SYCL
+#if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
   REGISTER_KERNEL_BUILDER(Name("ShapeN")                         \
                               .Device(DEVICE_SYCL)               \
@@ -171,9 +170,11 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                           ShapeNOp<int64>)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
+// A special SYCL kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("ShapeN")
                             .Device(DEVICE_SYCL)
                             .HostMemory("input")
@@ -201,9 +202,13 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
                               .TypeConstraint<type>("T") \
                               .HostMemory("output"),     \
                           RankOp);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 
+// A special SYCL kernel for int32 and bool.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Rank")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -245,7 +250,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank")
                             .HostMemory("input")
                             .HostMemory("output"),
                         RankOp);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // Size ------------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Size")
@@ -294,7 +299,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SizeOp<int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
@@ -310,10 +315,13 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                               .TypeConstraint<int64>("out_type") \
                               .HostMemory("output"),             \
                           SizeOp<int64>);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-TF_CALL_bool(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 
+// A special SYCL kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Size")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -328,7 +336,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SizeOp<int64>);
-#endif  // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
 
 // ExpandDims ------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -357,7 +365,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp);
-#endif  // GOOGLE_CUDA
+#endif // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                           \
@@ -367,8 +375,9 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .TypeConstraint<int32>("Tdim") \
                               .HostMemory("dim"),            \
                           ExpandDimsOp);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-TF_CALL_bool(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float)
+REGISTER_SYCL_KERNEL(double)
+
 #undef REGISTER_SYCL_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -379,7 +388,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp);
-#endif  // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
 
 // Squeeze ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
@@ -402,23 +411,26 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SqueezeOp);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                   \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+#define REGISTER_SYCL_KERNEL(type)                                  \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
       SqueezeOp);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-TF_CALL_bool(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 
+// A special SYCL kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Squeeze")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SqueezeOp);
-#endif  // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index d46701749b..ee6f9a28cd 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -118,43 +118,6 @@ static void SharedValidation(OpKernelContext* context,
   }
 }
 
-// Extracted out code in SliceOp::Compute so that MklSliceOp can reuse this
-// generic code
-template <typename T>
-static void SharedSliceCommonCases(OpKernelContext* context,
-                                   TensorShape* output_shape,
-                                   gtl::InlinedVector<int64, 4>* begin,
-                                   gtl::InlinedVector<int64, 4>* size,
-                                   Tensor** result,
-                                   bool* done) {
-  bool is_identity = true;
-  bool slice_dim0 = true;
-  *done = false;
-
-  SharedValidation(context, output_shape, &is_identity, &slice_dim0, begin,
-                   size);
-  if (!context->status().ok()) return;
-  const Tensor& input = context->input(0);
-  if (is_identity) {
-    VLOG(1) << "Slice identity";
-    context->set_output(0, input);
-    *done = true;
-    return;
-  }
-
-  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
-                                          (*size)[0])) {
-    VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
-    CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
-    context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
-    *done = true;
-    return;
-  }
-
-  OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
-}
-
-
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -162,89 +125,29 @@ class SliceOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     TensorShape output_shape;
+    bool is_identity = true;
+    bool slice_dim0 = true;
     gtl::InlinedVector<int64, 4> begin;
     gtl::InlinedVector<int64, 4> size;
-    Tensor* result = nullptr;
-    bool done = false;
-    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
-                              &done);
-    if (!context->status().ok() || done == true) return;
-
+    SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin,
+                     &size);
+    if (!context->status().ok()) return;
     const Tensor& input = context->input(0);
-    const int input_dims = input.dims();
-
-    if (output_shape.num_elements() > 0) {
-      if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
-          DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
-        auto input = context->input(0).tensor<T, 2>();
-        auto output = result->tensor<T, 2>();
-        // TODO(agarwal): Consider multi-threading this loop for cases where
-        // size[0] is very large.
-        for (int i = 0; i < size[0]; ++i) {
-          const int64 row = begin[0] + i;
-          if (i + 1 < size[0]) {
-            port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
-            port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
-          }
-          memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
-        }
-        return;
-      }
-#define HANDLE_DIM(NDIM)                            \
-  if (input_dims == NDIM) {                         \
-    HandleCase<NDIM>(context, begin, size, result); \
-    return;                                         \
-  }
-
-      HANDLE_DIM(1);
-      HANDLE_DIM(2);
-      HANDLE_DIM(3);
-      HANDLE_DIM(4);
-      HANDLE_DIM(5);
-      HANDLE_DIM(6);
-      HANDLE_DIM(7);
-
-#undef HANDLE_DIM
-
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+    if (is_identity) {
+      VLOG(1) << "Slice identity";
+      context->set_output(0, input);
+      return;
     }
-  }
 
- private:
-  template <int NDIM>
-  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
-                  const gtl::ArraySlice<int64>& size, Tensor* result) {
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
-    for (int i = 0; i < NDIM; ++i) {
-      indices[i] = begin[i];
-      sizes[i] = size[i];
+    if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), begin[0], size[0])) {
+      VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
+      CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
+      context->set_output(0, input.Slice(begin[0], begin[0] + size[0]));
+      return;
     }
 
-    functor::Slice<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
-        context->input(0).tensor<T, NDIM>(), indices, sizes);
-  }
-};
-
-#ifdef INTEL_MKL
-template <typename Device, typename T>
-class MklSliceOp : public OpKernel {
- public:
-  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    TensorShape output_shape;
-    gtl::InlinedVector<int64, 4> begin;
-    gtl::InlinedVector<int64, 4> size;
     Tensor* result = nullptr;
-    bool done = false;
-    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
-                              &done);
-    if (!context->status().ok() || done == true) return;
-
-    const Tensor& input = context->input(0);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
     const int input_dims = input.dims();
 
     if (output_shape.num_elements() > 0) {
@@ -286,123 +189,9 @@ class MklSliceOp : public OpKernel {
   }
 
  private:
-  // Helper function for DoesSliceShapeDifferInOnly1D. Checks if the following
-  // criteria matches for slice_dim: if indices for slice are 0 in all dims
-  // except slice_dim and if sizes of all the dimensions of the slice are same
-  // as the sizes of all the dimensions of the input except slice_dim, then
-  // returns True. Otherwise, returns False.
-  bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int slice_dim) {
-    for (int dim = 0; dim < 4; dim++) {
-      if (dim != slice_dim &&
-          (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Is 'input' tensor being sliced over a single dimension out of 4?
-  //
-  // This check is applicable in the context of Slice of a 4-D tensor in
-  // NHWC or NCHW format over channel dimension.
-  //
-  // If indices for slice are 0 in all dims except one dimension and if sizes of
-  // all dimensions of slice are same as sizes of all dimensions of inputs
-  // except that dimension, then we are slicing over a single dimension.
-  //
-  // Returns True if Slicing over a single dimension, and sets slice_dim
-  // to the number of the dimension that satisfies criteria.
-  bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int* slice_dim) {
-    for (int dim = 0; dim < 4; dim++) {
-      if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
-        *slice_dim = dim;
-        return true;
-      }
-    }
-    return false;
-  }
-
   template <int NDIM>
-  void HandleCase(OpKernelContext* context,
-                  const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
-    int slice_dim = -1;
-    TensorShape in_shape = context->input(0).shape();
-    // Special case for handling 4-D tensor slice when shape of the slice
-    // differs from the input tensor in only 1 out of 4 dimensions.
-    // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
-    // format over channel dimension.
-    if (NDIM == 4 &&
-        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
-                                          in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(3),
-                                 (size_t) 1
-                               };
-
-        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
-                                  (size_t) size[2] * size[3],
-                                  (size_t) size[3],
-                                  (size_t) 1 };
-
-        T *in_buf = const_cast<T*>(const_cast<const T*>(
-                    context->input(0).flat<T>().data()));
-        T *op_buf = result->flat<T>().data();
-
-        if (slice_dim == 1) {
-          /* data format = NCHW */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip  = in_buf + (d0 * in_strides[0]);
-              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              // For NCHW, H and W will be contiguous. So we can copy
-              // both with one memcpy.
-              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                     sizeof(T) * in_strides[1]);
-            }
-          }
-          return;
-        } else if (slice_dim == 3) {
-          /* data_format = NHWC */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip = in_buf + (d0 * in_strides[0]);
-              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              #pragma omp parallel for
-              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-                T *ip2 = ip1 + (d2 * in_strides[2]);
-                T *ip3 = ip2 + begin[3];
-                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-                T *op3 = op2;
-                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                       sizeof(T) * size[3]);
-              }
-            }
-          }
-          return;
-        }
-        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
-    }
-
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
     for (int i = 0; i < NDIM; ++i) {
@@ -415,7 +204,6 @@ class MklSliceOp : public OpKernel {
         context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
-#endif
 
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
@@ -445,7 +233,6 @@ DECLARE_FOR_N(bfloat16);
 #undef DECLARE_CPU_SPEC
 }  // namespace functor
 
-#ifndef INTEL_MKL
 #define REGISTER_SLICE(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Slice")                  \
                               .Device(DEVICE_CPU)        \
@@ -457,21 +244,8 @@ DECLARE_FOR_N(bfloat16);
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
 REGISTER_SLICE(bfloat16);
-#undef REGISTER_SLICE
-#else
-#define REGISTER_SLICE(type)                             \
-  REGISTER_KERNEL_BUILDER(Name("Slice")                  \
-                              .Device(DEVICE_CPU)        \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("begin")       \
-                              .HostMemory("size"),       \
-                          MklSliceOp<CPUDevice, type>)
 
-TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
-TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
-#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
diff --git a/tensorflow/core/kernels/sparse_reduce_op.cc b/tensorflow/core/kernels/sparse_reduce_op.cc
deleted file mode 100644
index 9e60791f97..0000000000
--- a/tensorflow/core/kernels/sparse_reduce_op.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/sparse_ops.cc.
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_util.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/util/sparse/sparse_tensor.h"
-
-// TODO(b/31496047): Fix non-standard include order.
-#include <numeric>  // clang-format off
-
-using tensorflow::sparse::SparseTensor;
-using tensorflow::gtl::ArraySlice;
-
-namespace tensorflow {
-
-struct ReduceDetails {
-  // The dimensions to call Reorder() with.
-  std::vector<int64> reorder_dims;
-
-  // The dimensions to call group() with after Reorder().
-  std::vector<int64> group_by_dims;
-
-  // The shape after reduction.
-  TensorShape reduced_shape;
-};
-
-// Compute common reduce parameters that'll be used for SparseTensor
-// reductions. Usage:
-// ReduceDetails reduction = SparseTensorReduceHelper(sp, axes, keep_dims);
-// sp.Reorder(reduction.reorder_dims);
-// for (const auto& g : sp.group(reduction.group_by_dims)) {
-//   ...
-// }
-// // Set output shape to reduction.reduced_shape.
-ReduceDetails SparseTensorReduceHelper(const SparseTensor &sp,
-                                       gtl::ArraySlice<int32> axes_slice,
-                                       bool keep_dims) {
-  ReduceDetails reduction;
-
-  std::vector<int32> reduction_axes(axes_slice.begin(), axes_slice.end());
-  int ndims = sp.dims();
-  for (int64 i = 0; i < reduction_axes.size(); ++i) {
-    reduction_axes[i] = (reduction_axes[i] + ndims) % ndims;
-  }
-  std::sort(reduction_axes.begin(), reduction_axes.end());
-
-  // (0) Calculate the grouping dimensions:
-  // group_by_dims == {0, .., NDIMS-1} \ reduction_axes.
-  std::vector<int64> perm(ndims);
-  std::iota(perm.begin(), perm.end(), 0);
-
-  // Requires perm and reduction_axes_ be sorted; group_by_dims will be
-  // sorted as well.
-  std::set_difference(
-      perm.begin(), perm.end(), reduction_axes.begin(), reduction_axes.end(),
-      std::inserter(reduction.group_by_dims, reduction.group_by_dims.begin()));
-
-  // Now append the rest of the axes (the complement of group_by_dims_);
-  // result is used by Reorder().
-  reduction.reorder_dims = reduction.group_by_dims;
-  std::set_difference(perm.begin(), perm.end(), reduction.group_by_dims.begin(),
-                      reduction.group_by_dims.end(),
-                      std::back_inserter(reduction.reorder_dims));
-
-  // (1) Calculate the shape after reduction.
-  auto sp_shape = sp.shape();
-  std::vector<int64> out_dim_sizes;
-  if (keep_dims) {
-    out_dim_sizes.reserve(ndims);
-    auto beg = reduction.group_by_dims.begin();
-    auto end = reduction.group_by_dims.end();
-    for (int d = 0; d < ndims; ++d) {
-      if (std::find(beg, end, d) == end) {
-        out_dim_sizes.push_back(1);  // A reduced axis.
-      } else {
-        out_dim_sizes.push_back(sp_shape[d]);
-      }
-    }
-  } else {
-    out_dim_sizes = sp.PickDims(reduction.group_by_dims);
-  }
-
-  reduction.reduced_shape = TensorShape(out_dim_sizes);
-  return reduction;
-}
-
-Status ValidateInputs(const Tensor *shape_t, const Tensor *reduction_axes_t) {
-  // indices and values are validated in SparseTensor ctor.
-  if (!TensorShapeUtils::IsVector(shape_t->shape())) {
-    return errors::InvalidArgument(
-        "Expected input_shape to be a vector; got shape: ",
-        shape_t->shape().DebugString());
-  }
-  if (!TensorShapeUtils::IsScalar(reduction_axes_t->shape()) &&
-      !TensorShapeUtils::IsVector(reduction_axes_t->shape())) {
-    return errors::InvalidArgument(
-        "Expected reduction_axes to be a scalar or a vector; got shape: ",
-        reduction_axes_t->shape().DebugString());
-  }
-
-  const auto reduction_axes_flat = reduction_axes_t->flat<int32>();
-  for (int64 i = 0; i < reduction_axes_flat.size(); i++) {
-    int32 axis = reduction_axes_flat(i);
-    if (axis < -shape_t->NumElements() || axis >= shape_t->NumElements()) {
-      return errors::InvalidArgument("Invalid reduction dimension ", axis,
-                                     ", for input with ",
-                                     shape_t->NumElements(), " dimensions.");
-    }
-  }
-
-  return Status::OK();
-}
-
-struct SumOp {
-  template <typename T>
-  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
-      s.device(ctx->eigen_cpu_device()) = v.sum();
-  }
-  static StringPiece Name() {
-      return "sum";
-  }
-};
-
-struct MaxOp {
-  template <typename T>
-  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
-      s.device(ctx->eigen_cpu_device()) = v.maximum();
-  }
-  static StringPiece Name() {
-      return "max";
-  }
-};
-
-template <typename T, typename Op>
-class SparseReduceOp : public OpKernel {
- public:
-  explicit SparseReduceOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
-  }
-
-  void Compute(OpKernelContext *ctx) override {
-    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
-    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
-    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
-
-    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
-
-    // TODO(zongheng): we will call Reorder() below, which will modify
-    // in-place the underlying indices and values buffers.  To avoid
-    // surprises of this kernel being stateful, we work around the above by
-    // making deep copies here.  Remove this if/when we change Reorder()'s
-    // semantics.
-    const auto shape_vec = shape_t->vec<int64>();
-    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
-                    TensorShape(shape_vec));
-    ReduceDetails reduction = SparseTensorReduceHelper(
-        sp, reduction_axes_t->flat<int32>(), keep_dims_);
-
-    Tensor *out_values;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output(0, reduction.reduced_shape, &out_values));
-    auto out_flat = out_values->flat<T>();
-    out_flat.setZero();
-
-    Tensor tmp_reduced_val;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_reduced_val));
-    auto reduced_val = tmp_reduced_val.scalar<T>();
-
-    // Compute strides, and use it to convert coords to flat index.  The
-    // coordinates returned by .group() have the same ndims as group_by_dims.
-    gtl::InlinedVector<int64, 8> output_strides(reduction.group_by_dims.size());
-    if (!output_strides.empty()) {  // Do this iff we don't reduce all.
-      output_strides.back() = 1;
-      for (int d = output_strides.size() - 2; d >= 0; --d) {
-        output_strides[d] =
-            output_strides[d + 1] * shape_vec(reduction.group_by_dims[d + 1]);
-      }
-    }
-
-    auto CoordinatesToFlatIndex = [](ArraySlice<int64> coords,
-                                     ArraySlice<int64> strides) {
-      if (strides.empty()) {  // Reduce all.
-        return 0LL;
-      }
-      CHECK_EQ(coords.size(), strides.size());
-      int64 idx = 0;
-      for (int i = 0; i < coords.size(); ++i) {
-        idx += coords[i] * strides[i];
-      }
-      return idx;
-    };
-
-    // Each group maps one-on-one onto a value in the reduced tensor.
-    // g.group() provides the coordinates of a particular reduced value.
-    sp.Reorder<T>(reduction.reorder_dims);
-    for (const auto &g : sp.group(reduction.group_by_dims)) {
-      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
-      const int64 idx = CoordinatesToFlatIndex(g.group(), output_strides);
-      out_flat(idx) = reduced_val();
-      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; idx: " << idx << "; group " << Op::Name() << ": "
-              << reduced_val();
-    }
-  }
-
- private:
-  // True if the number of dimensions should be maintained.
-  bool keep_dims_;
-};
-
-#define REGISTER_KERNELS(T)                                              \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("SparseReduceSum").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceOp<T, SumOp>)
-TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-#define REGISTER_KERNELS(T)                                              \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("SparseReduceMax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceOp<T, MaxOp>)
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-template <typename T, typename Op>
-class SparseReduceSparseOp : public OpKernel {
- public:
-  explicit SparseReduceSparseOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
-  }
-
-  void Compute(OpKernelContext *ctx) override {
-    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
-    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
-    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
-
-    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
-
-    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
-                    TensorShape(shape_t->vec<int64>()));
-    ReduceDetails reduction = SparseTensorReduceHelper(
-        sp, reduction_axes_t->flat<int32>(), keep_dims_);
-
-    sp.Reorder<T>(reduction.reorder_dims);
-    // Count nnzs in the output SparseTensor.
-    int64 nnz = 0;
-    auto iter = sp.group(reduction.group_by_dims);
-    for (auto it = iter.begin(); it != iter.end(); ++it) {
-      nnz++;
-    }
-
-    Tensor *out_indices_t;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output(
-                       0, TensorShape({nnz, reduction.reduced_shape.dims()}),
-                       &out_indices_t));
-    typename TTypes<int64>::Matrix out_indices_mat =
-        out_indices_t->matrix<int64>();
-    // For keep_dims. We don't explicitly set dim fields for reduced dims below.
-    out_indices_mat.setZero();
-
-    Tensor *out_values_t;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output(1, TensorShape({nnz}), &out_values_t));
-    auto out_flat = out_values_t->flat<T>();
-
-    Tensor tmp_reduced_val;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_reduced_val));
-    auto reduced_val = tmp_reduced_val.scalar<T>();
-    int64 i = 0;
-    for (const auto &g : sp.group(reduction.group_by_dims)) {
-      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
-      std::vector<int64> group = g.group();
-      for (int64 j = 0; j < group.size(); j++) {
-        if (keep_dims_) {
-          out_indices_mat(i, reduction.group_by_dims[j]) = group[j];
-        } else {
-          out_indices_mat(i, j) = group[j];
-        }
-      }
-      out_flat(i) = reduced_val();
-      i++;
-      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; group " << Op::Name() << ": "
-              << reduced_val();
-    }
-
-    Tensor *out_shape_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-                            2, TensorShape({reduction.reduced_shape.dims()}),
-                            &out_shape_t));
-    auto out_shape_flat = out_shape_t->flat<int64>();
-    auto out_dim_sizes = reduction.reduced_shape.dim_sizes();
-    std::copy(out_dim_sizes.begin(), out_dim_sizes.end(), &out_shape_flat(0));
-  }
-
- private:
-  // True if the number of dimensions should be maintained.
-  bool keep_dims_;
-};
-
-#define REGISTER_KERNELS(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("SparseReduceSumSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSparseOp<T, SumOp>)
-TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-#define REGISTER_KERNELS(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("SparseReduceMaxSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSparseOp<T, MaxOp>)
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index 2db3e5ef77..b4698a8053 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -40,9 +40,6 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
-#ifdef TENSORFLOW_USE_SYCL
-typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -185,10 +182,6 @@ class StackOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp);
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"),
                         StackOp);
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_SYCL).HostMemory("handle"),
-                        StackOp);
-#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -220,11 +213,7 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        ( std::is_same<Device, GPUDevice>::value
-#ifdef TENSORFLOW_USE_SYCL
-          || std::is_same<Device, SYCLDevice>::value
-#endif // TENSORFLOW_USE_SYCL
-        ) &&
+        std::is_same<Device, GPUDevice>::value &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -300,31 +289,6 @@ REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                        \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("handle")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>);
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
-
-#define REGISTER_SYCL_HOST_KERNEL(type)                   \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("handle")       \
-                              .HostMemory("elem")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>)
-
-REGISTER_SYCL_HOST_KERNEL(int32);
-REGISTER_SYCL_HOST_KERNEL(bool);
-#undef REGISTER_SYCL_KERNEL
-#undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 class StackPopOp : public AsyncOpKernel {
  public:
   explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
@@ -395,31 +359,6 @@ REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
-                              .Device(DEVICE_SYCL)                \
-                              .HostMemory("handle")               \
-                              .TypeConstraint<type>("elem_type"), \
-                          StackPopOp)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
-
-#define REGISTER_SYCL_HOST_KERNEL(type)                           \
-  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
-                              .Device(DEVICE_SYCL)                \
-                              .HostMemory("handle")               \
-                              .HostMemory("elem")                 \
-                              .TypeConstraint<type>("elem_type"), \
-                          StackPopOp)
-
-REGISTER_SYCL_HOST_KERNEL(int32);
-REGISTER_SYCL_HOST_KERNEL(bool);
-
-#undef REGISTER_SYCL_KERNEL
-#undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 class StackCloseOp : public OpKernel {
  public:
   explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -437,8 +376,5 @@ class StackCloseOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp);
 REGISTER_KERNEL_BUILDER(
     Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(
-    Name("StackClose").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp);
-#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 5c89eaef5f..630fcb76f3 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -93,7 +93,7 @@ class TopK : public OpKernel {
       rows_by_one.set(0, num_rows);
 #else
       Eigen::array<int, 1> reduce_on_cols = {1};
-      Eigen::array<int, 2> rows_by_one = {static_cast<int>(num_rows), 1};
+      Eigen::array<int, 2> rows_by_one = {static_cast<int>(num_rows), 1};
 #endif
 
       values.device(d) =
diff --git a/tensorflow/core/kernels/transpose_functor.h b/tensorflow/core/kernels/transpose_functor.h
index f1ab770eeb..124cf14dd2 100644
--- a/tensorflow/core/kernels/transpose_functor.h
+++ b/tensorflow/core/kernels/transpose_functor.h
@@ -132,13 +132,6 @@ template <typename Device, typename T, int NDIMS>
 void TransposeUsingEigen(const Device& d, const Tensor& in,
                          const gtl::ArraySlice<int32> perm, Tensor* out);
 
-
-#ifdef TENSORFLOW_USE_SYCL
-// For SYCL lets always go through Eigen
-template <typename Device, typename T>
-void TransposeSYCL(const Device& d, const Tensor& in,
-                   const gtl::ArraySlice<int32> perm, Tensor* out);
-#endif // TENSORFLOW_USE_SYCL
 }  // namespace internal
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index d3305fb83a..75ed76a697 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -233,7 +233,10 @@ Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                               .TypeConstraint<int32>("Tperm") \
                               .HostMemory("perm"),            \
                           TransposeSyclOp);
-TF_CALL_POD_TYPES(REGISTER);
+REGISTER(float);
+REGISTER(bool);
+REGISTER(int32);
 #undef REGISTER
 #endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/typed_conditional_accumulator_base.h b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
index 1980f758fc..dbd7de7ce0 100644
--- a/tensorflow/core/kernels/typed_conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 
 /*
  * TypedConditionalAccumulatorBase is a templated companion of
- * ConditionalAccumulatorBase which allows for subclasses to use different
+ * ConditionalAccumulatorBase which allows for subclasses to use different
  * types for the input gradients. (See ConditionalAccumulator and
  * SparseConditionalAccumulator.)
  *
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 2ff8b9c7d1..8ba4b09143 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -656,7 +656,7 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
   // a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 9a58a31757..a530d286f7 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -155,26 +155,6 @@ Status Log1pGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Log1p", Log1pGrad);
 
-Status SinhGrad(const AttrSlice& attrs, FunctionDef* g) {
-  // clang-format off
-  return GradForUnaryCwise(g, {
-      {{"cosh"}, "Cosh", {"x"}, {}, {"dy"}},
-      {{"dx"}, "Mul", {"dy", "cosh"}},  // dy * cosh(x)
-  });
-  // clang-format on
-}
-REGISTER_OP_GRADIENT("Sinh", SinhGrad);
-
-Status CoshGrad(const AttrSlice& attrs, FunctionDef* g) {
-  // clang-format off
-  return GradForUnaryCwise(g, {
-      {{"sinh"}, "Sinh", {"x"}, {}, {"dy"}},
-      {{"dx"}, "Mul", {"dy", "sinh"}},  // dy * sinh(x)
-  });
-  // clang-format on
-}
-REGISTER_OP_GRADIENT("Cosh", CoshGrad);
-
 Status TanhGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForUnaryCwise(g, {
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index aa9706a328..38813b3f2b 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -495,26 +495,6 @@ TEST_F(MathGradTest, Log1p) {
   test::ExpectClose(ans, dx);
 }
 
-TEST_F(MathGradTest, Sinh) {
-  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
-                                 TensorShape({2, 3}));
-  auto g = [](float x) { return std::cosh(x); };
-  auto dx = test::AsTensor<float>(
-      {g(-3.f), g(-2.f), g(-1.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
-  auto ans = SymGrad("Sinh", x);
-  test::ExpectClose(ans, dx);
-}
-
-TEST_F(MathGradTest, Cosh) {
-  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
-                                 TensorShape({2, 3}));
-  auto g = [](float x) { return std::sinh(x); };
-  auto dx = test::AsTensor<float>(
-      {g(-3.f), g(-2.f), g(-1.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
-  auto ans = SymGrad("Cosh", x);
-  test::ExpectClose(ans, dx);
-}
-
 TEST_F(MathGradTest, Tanh) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 30d6987707..3fe543a239 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -293,14 +293,6 @@ Computes natural logarithm of (1 + x) element-wise.
 I.e., \\(y = \log_e (1 + x)\\).
 )doc");
 
-REGISTER_OP("Sinh").UNARY_COMPLEX().Doc(R"doc(
-Computes hyperbolic sine of x element-wise.
-)doc");
-
-REGISTER_OP("Cosh").UNARY_COMPLEX().Doc(R"doc(
-Computes hyperbolic cosine of x element-wise.
-)doc");
-
 REGISTER_OP("Tanh").UNARY_COMPLEX().Doc(R"doc(
 Computes hyperbolic tangent of `x` element-wise.
 )doc");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 3a25fd15da..70302c3886 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -831,13 +831,11 @@ a different filter to each input channel (expanding from 1 channel to
 `channel_multiplier` channels for each), then concatenates the results
 together. Thus, the output has `in_channels * channel_multiplier` channels.
 
-```
 for k in 0..in_channels-1
   for q in 0..channel_multiplier-1
     output[b, i, j, k * channel_multiplier + q] =
       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
                         filter[di, dj, k, q]
-```
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
 horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 08894dcafc..b122b5a992 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5184,31 +5184,6 @@ op {
   }
   summary: "Computes cos of x element-wise."
 }
-op {
-  name: "Cosh"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  summary: "Computes hyperbolic cosine of x element-wise."
-}
 op {
   name: "CountUpTo"
   input_arg {
@@ -6302,7 +6277,7 @@ op {
     }
   }
   summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
-  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\n```\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n```\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
+  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
 }
 op {
   name: "DepthwiseConv2dNativeBackpropFilter"
@@ -22817,31 +22792,6 @@ op {
   }
   summary: "Computes sin of x element-wise."
 }
-op {
-  name: "Sinh"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  summary: "Computes hyperbolic sine of x element-wise."
-}
 op {
   name: "Size"
   input_arg {
@@ -24938,123 +24888,6 @@ op {
   summary: "Multiply matrix \"a\" by matrix \"b\"."
   description: "The inputs must be two-dimensional matrices and the inner dimension of \"a\" must\nmatch the outer dimension of \"b\". This op is optimized for the case where at\nleast one of \"a\" or \"b\" is sparse. The breakeven for using this versus a dense\nmatrix multiply on one platform was 30% zero values in the sparse matrix.\n\nThe gradient computation of this operation will only take advantage of sparsity\nin the input gradient when that gradient comes from a Relu."
 }
-op {
-  name: "SparseReduceMax"
-  input_arg {
-    name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
-    type: DT_INT64
-  }
-  input_arg {
-    name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
-    type: DT_INT64
-  }
-  input_arg {
-    name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    description: "`R-K`-D.  The reduced Tensor."
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-    description: "If true, retain reduced dimensions with length 1."
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
-}
-op {
-  name: "SparseReduceMaxSparse"
-  input_arg {
-    name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
-    type: DT_INT64
-  }
-  input_arg {
-    name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
-    type: DT_INT64
-  }
-  input_arg {
-    name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-    description: "If true, retain reduced dimensions with length 1."
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a\nSparseTensor.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
-}
 op {
   name: "SparseReduceSum"
   input_arg {
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 6aca2c3b01..9722f0ee9a 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -710,75 +710,6 @@ a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
 b: `ndims`-D Tensor.  With shape `a_shape`.
 )doc");
 
-REGISTER_OP("SparseReduceMax")
-    .Input("input_indices: int64")
-    .Input("input_values: T")
-    .Input("input_shape: int64")
-    .Input("reduction_axes: int32")
-    .Attr("keep_dims: bool = False")
-    .Output("output: T")
-    .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the max of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-instead of a sparse one.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: `R-K`-D.  The reduced Tensor.
-)doc");
-
-REGISTER_OP("SparseReduceMaxSparse")
-    .Input("input_indices: int64")
-    .Input("input_values: T")
-    .Input("input_shape: int64")
-    .Input("reduction_axes: int32")
-    .Attr("keep_dims: bool = False")
-    .Output("output_indices: int64")
-    .Output("output_values: T")
-    .Output("output_shape: int64")
-    .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the max of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-SparseTensor.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-)doc");
-
 REGISTER_OP("SparseReduceSum")
     .Input("input_indices: int64")
     .Input("input_values: T")
diff --git a/tensorflow/core/platform/cloud/retrying_utils.cc b/tensorflow/core/platform/cloud/retrying_utils.cc
index 99691ecfb9..096c77c6e3 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils.cc
@@ -89,7 +89,7 @@ Status RetryingUtils::DeleteWithRetries(
   bool is_retried = false;
   return RetryingUtils::CallWithRetries(
       [delete_func, &is_retried]() {
-        const Status status = delete_func();
+        const auto& status = delete_func();
         if (is_retried && status.code() == error::NOT_FOUND) {
           return Status::OK();
         }
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e476a84a13..cf05aece39 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -171,7 +171,7 @@ message ExecutorOpts {
 };
 
 message RunGraphRequest {
-  // session_handle is the master-generated unique id for this session.
+  // session_handle is the master-generated unique id for this session.
   // If session_handle is non-empty, it must be the same as used when
   // registering the graph. If it is empty, a single global namespace is used to
   // search for the graph_handle.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 0e5611e359..d30d7819fc 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc2"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 67468bdc3f..6a37256ea9 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
 #include "third_party/mkl/include/mkl_service.h"
-#include "third_party/mkl/include/mkl_trans.h"
+
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -616,42 +616,6 @@ inline void ForwarMklTensorInToOut(OpKernelContext* context,
   }
 }
 
-  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
-  // out. 
-inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
-  const float* buf_in = input.flat<float>().data();
-  float* buf_out = (*output)->flat<float>().data();
-
-  int64 N = input.dim_size(0);
-  int64 H = input.dim_size(1);
-  int64 W = input.dim_size(2);
-  int64 C = input.dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
-  for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C,
-        buf_out + n*stride_n, H*W);
-  }
-}
-
-  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
-  // out. 
-inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
-  const float* buf_in = input.flat<float>().data();
-  float* buf_out = (*output)->flat<float>().data();
-
-  int64 N = (*output)->dim_size(0);
-  int64 H = (*output)->dim_size(1);
-  int64 W = (*output)->dim_size(2);
-  int64 C = (*output)->dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
-  for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W,
-        buf_out + n*stride_n, C);
-  }
-}
-
 namespace mkl_op_registry {
 static const char* kMklOpLabel = "MklOp";
 static const char* kMklOpLabelPattern = "label='MklOp'";
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index 30123e367f..8c289dd556 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -13,8 +13,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions.
 It is common, when training with multiple loss functions, to adjust the relative
 strengths of individual losses. This is performed by rescaling the losses via
 a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and mean_squared_error, and we wished that the
-log_loss penalty be twice as severe as the mean_squared_error, we would
+training with both log_loss and mean_squared_error, and we wished that the
+log_loss penalty be twice as severe as the mean_squared_error, we would
 implement this as:
 
 ```python
@@ -22,7 +22,7 @@ implement this as:
   tf.contrib.losses.log(predictions, labels, weight=2.0)
 
   # Uses default weight of 1.0
-  tf.contrib.losses.mean_squared_error(predictions, labels)
+  tf.contrib.losses.mean_squared_error(predictions, labels)
 
   # All the losses are collected into the `GraphKeys.LOSSES` collection.
   losses = tf.get_collection(tf.GraphKeys.LOSSES)
@@ -74,7 +74,7 @@ these predictions.
   predictions = MyModelPredictions(images)
 
   weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
+  loss  = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 Note that when using weights for the losses, the final average is computed
@@ -100,7 +100,7 @@ weighted average over the individual prediction errors:
 
   weight = MyComplicatedWeightingFunction(labels)
   weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
+  loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 @{tf.contrib.losses.absolute_difference}
diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md
index 3d9f203297..b3c12a61dd 100644
--- a/tensorflow/docs_src/api_guides/python/math_ops.md
+++ b/tensorflow/docs_src/api_guides/python/math_ops.md
@@ -59,8 +59,6 @@ mathematical functions to your graph.
 *   @{tf.acos}
 *   @{tf.asin}
 *   @{tf.atan}
-*   @{tf.cosh}
-*   @{tf.sinh}
 *   @{tf.lgamma}
 *   @{tf.digamma}
 *   @{tf.erf}
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index d1c9cd696c..77b8e2dd2e 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -135,9 +135,8 @@ adder_node = a + b  # + provides a shortcut for tf.add(a, b)
 
 The preceding three lines are a bit like a function or a lambda in which we
 define two input parameters (a and b) and then an operation on them. We can
-evaluate this graph with multiple inputs by using the feed_dict argument to
-the [run method](https://www.tensorflow.org/api_docs/python/tf/Session#run)
-to feed concrete values to the placeholders:
+evaluate this graph with multiple inputs by using the feed_dict parameter to
+supply concrete values for these placeholders:
 
 ```python
 print(sess.run(adder_node, {a: 3, b:4.5}))
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 175de2be76..624d916474 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -367,7 +367,7 @@ train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
 
 In this case, we ask TensorFlow to minimize `cross_entropy` using the
 [gradient descent algorithm](https://en.wikipedia.org/wiki/Gradient_descent)
-with a learning rate of 0.05. Gradient descent is a simple procedure, where
+with a learning rate of 0.05. Gradient descent is a simple procedure, where
 TensorFlow simply shifts each variable a little bit in the direction that
 reduces the cost. But TensorFlow also provides
 @{$python/train#Optimizers$many other optimization algorithms}:
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index 27fae45b5b..48d9a395f2 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -82,7 +82,7 @@ After creating placeholders for the data, the graph is built from the
 `mnist.py` file according to a 3-stage pattern: `inference()`, `loss()`, and
 `training()`.
 
-1.  `inference()` - Builds the graph as far as required for running
+1.  `inference()` - Builds the graph as far as is required for running
 the network forward to make predictions.
 1.  `loss()` - Adds to the inference graph the ops required to generate
 loss.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 81aa6e3f76..91189f199d 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -35,7 +35,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for Mac OS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.2.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 3f9096b822..c9b8dffadb 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.2.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.2.0-rc2.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 40ed9e1826..612c4c94f2 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.2.0</version>
+  <version>1.2.0-rc2</version>
 </dependency>
 ```
 
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.2.0</version>
+                 <version>1.2.0-rc2</version>
                </dependency>
              </dependencies>
          </project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.2.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.2.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.2.0-rc2.zip).
   3. Extract this .zip file.
 
 
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.2.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.2.0-rc2.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and Mac OS X:
 
-<pre><b>java -cp libtensorflow-1.2.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.2.0-rc2.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.2.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.2.0-rc2.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 99f27d7b85..8ce4acda13 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -171,8 +171,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      issue the following command to install TensorFlow in the active
      virtualenv environment:
 
-     <pre>(tensorflow)$ <b>pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b></pre>
+     <pre>(tensorflow)$ <b>pip3 install --upgrade \
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -276,8 +276,8 @@ take the following steps:
      the following command:
 
      <pre>
-     $ <b>sudo pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b>
+     $ <b>sudo pip3 install --upgrade \
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -464,7 +464,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 8ff0fb872f..f85ecefb83 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -108,8 +108,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      Python 2.7, the command to install
      TensorFlow in the active Virtualenv is as follows:
 
-     <pre> $ <b>pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b></pre>
+     <pre> $ <b>pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -229,8 +229,8 @@ take the following steps:
      you are installing TensorFlow for Mac OS and Python 2.7
      issue the following command:
 
-     <pre> $ <b>sudo pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b> </pre>
+     <pre> $ <b>sudo pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl
 </pre>
 
 
@@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index a082c3ce78..c455492387 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -342,10 +342,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.2.0 on Linux:
+for TensorFlow 1.2.0rc2 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.2.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.2.0rc2-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 8282afaab4..42820660ee 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -38,10 +38,9 @@ installed on your system:
     [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
     list of supported GPU cards.
 
-If you have a different version of one of the preceding packages, please
-change to the specified versions.  In particular, the cuDNN version
-must match exactly: TensorFlow will not load if it cannot find `cuDNN64_5.dll`.
-To use a different version of cuDNN, you must build from source.
+If you have an earlier version of the preceding packages, please
+upgrade to the specified versions.
+
 
 ## Determine how to install TensorFlow
 
@@ -74,7 +73,7 @@ Use that package at your own risk.
 If the following version of Python is not installed on your machine,
 install it now:
 
-  * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/)
+  * [Python 3.5.x from python.org](https://www.python.org/downloads/release/python-352/)
 
 TensorFlow only supports version 3.5.x of Python on Windows.
 Note that Python 3.5.x comes with the pip3 package manager, which is the
@@ -115,12 +114,12 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      environment. To install the CPU-only version of TensorFlow, enter the
      following command:
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
 
      To install the GPU version of TensorFlow, enter the following command
      (on a single line):
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
 
 ## Validate your installation
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index a5508ac23e..07c5d3087f 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -52,8 +52,7 @@ bazel build -c opt --copt=-march="broadwell" --config=cuda //tensorflow/tools/pi
     (pascal): 6.2, Titan X (maxwell): 5.2, and K80: 3.7.
 *   Install the latest CUDA platform and cuDNN libraries.
 *   Make sure to use a version of gcc that supports all of the optimizations of
-    the target CPU. The recommended minimum gcc version is 4.8.3.  On OS X upgrade
-    to the latest Xcode version and use the version of clang that comes with Xcode.
+    the target CPU. The recommended minimum gcc version is 4.8.3.
 *   TensorFlow checks on startup whether it has been compiled with the
     optimizations available on the CPU. If the optimizations are not included,
     TensorFlow will emit warnings, e.g. AVX, AVX2, and FMA instructions not
@@ -123,11 +122,6 @@ format.
 The best practice is to build models that work with both `NCHW` and `NHWC` as it
 is common to train using `NCHW` on GPU, and then do inference with NHWC on CPU.
 
-There are edge cases where `NCHW` can be slower on GPU than `NHWC`. One
-[case](https://github.com/tensorflow/tensorflow/issues/7551#issuecomment-280421351)
-is using non-fused batch norm on WRN-16-4 without dropout. In that case using
-fused batch norm, which is also recommended, is the optimal solution.
-
 The very brief history of these two formats is that TensorFlow started by using
 `NHWC` because it was a little faster on CPUs. Then the TensorFlow team
 discovered that `NCHW` performs better when using the NVIDIA cuDNN library.  The
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index a37748d0c9..4667b4cad7 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -91,14 +91,11 @@ eight-bit computations:
 ```sh
 curl http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -o /tmp/inceptionv3.tgz
 tar xzf /tmp/inceptionv3.tgz -C /tmp/
-bazel build tensorflow/tools/graph_transforms:transform_graph
-bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
-  --in_graph=/tmp/classify_image_graph_def.pb \
-  --outputs="softmax" --out_graph=/tmp/quantized_graph.pb \
-  --transforms='add_default_attributes strip_unused_nodes(type=float, shape="1,299,299,3")
-    remove_nodes(op=Identity, op=CheckNumerics) fold_constants(ignore_errors=true)
-    fold_batch_norms fold_old_batch_norms quantize_weights quantize_nodes
-    strip_unused_nodes sort_by_execution_order'
+bazel build tensorflow/tools/quantization:quantize_graph
+bazel-bin/tensorflow/tools/quantization/quantize_graph \
+  --input=/tmp/classify_image_graph_def.pb \
+  --output_node_names="softmax" --output=/tmp/quantized_graph.pb \
+  --mode=eightbit
 ```
 
 This will produce a new model that runs the same operations as the original, but
diff --git a/tensorflow/docs_src/tutorials/seq2seq.md b/tensorflow/docs_src/tutorials/seq2seq.md
index dd2ca8d524..6ffe3e8b03 100644
--- a/tensorflow/docs_src/tutorials/seq2seq.md
+++ b/tensorflow/docs_src/tutorials/seq2seq.md
@@ -8,10 +8,7 @@ some input and generate a meaningful response? For example, could we train
 a neural network to translate from English to French? It turns out that
 the answer is *yes*.
 
-This tutorial will show you how to build and train such a system end-to-end.
-Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and
-the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub.
-You can then start by running the translate program:
+This tutorial will show you how to build and train such a system end-to-end. Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. You can then start by running the translate program:
 
 ```
 cd models/tutorials/rnn/translate
@@ -28,7 +25,7 @@ This tutorial references the following files.
 
 File | What's in it?
 --- | ---
-`tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
+`tensorflow/tensorflow/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
 `models/tutorials/rnn/translate/seq2seq_model.py` | Neural translation sequence-to-sequence model.
 `models/tutorials/rnn/translate/data_utils.py` | Helper functions for preparing translation data.
 `models/tutorials/rnn/translate/translate.py` | Binary that trains and runs the translation model.
@@ -151,9 +148,9 @@ have similar interfaces, so we will not describe them in detail. We will use
 ## Neural translation model
 
 While the core of the sequence-to-sequence model is constructed by
-the functions in `tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py`,
-there are still a few tricks that are worth mentioning that are used in our
-translation model in `models/tutorials/rnn/translate/seq2seq_model.py`.
+the functions in `tensorflow/tensorflow/python/ops/seq2seq.py`, there are still a few tricks
+that are worth mentioning that are used in our translation model in
+`models/tutorials/rnn/translate/seq2seq_model.py`.
 
 ### Sampled softmax and output projection
 
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 24c866eee5..c2621026c7 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -1,6 +1,6 @@
 # TensorFlow Linear Model Tutorial
 
-In this tutorial, we will use the tf.contrib.learn API in TensorFlow to solve a binary
+In this tutorial, we will use the TF.Learn API in TensorFlow to solve a binary
 classification problem: Given census data about a person such as age, gender,
 education and occupation (the features), we will try to predict whether or not
 the person earns more than 50,000 dollars a year (the target label). We will
@@ -16,7 +16,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
@@ -69,8 +69,8 @@ COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
            "marital_status", "occupation", "relationship", "race", "gender",
            "capital_gain", "capital_loss", "hours_per_week", "native_country",
            "income_bracket"]
-df_train = pd.read_csv(train_file.name, names=COLUMNS, skipinitialspace=True)
-df_test = pd.read_csv(test_file.name, names=COLUMNS, skipinitialspace=True, skiprows=1)
+df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
+df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
 ```
 
 Since the task is a binary classification problem, we'll construct a label
@@ -136,9 +136,9 @@ Here's a list of columns available in the Census Income dataset:
 
 ## Converting Data into Tensors
 
-When building a tf.contrib.learn model, the input data is specified by means of an Input
+When building a TF.Learn model, the input data is specified by means of an Input
 Builder function. This builder function will not be called until it is later
-passed to tf.contrib.learn methods such as `fit` and `evaluate`. The purpose of this
+passed to TF.Learn methods such as `fit` and `evaluate`. The purpose of this
 function is to construct the input data, which is represented in the form of
 @{tf.Tensor}s
 or
@@ -211,7 +211,7 @@ to predict the target label.
 ### Base Categorical Feature Columns
 
 To define a feature column for a categorical feature, we can create a
-`SparseColumn` using the tf.contrib.learn API. If you know the set of all possible
+`SparseColumn` using the TF.Learn API. If you know the set of all possible
 feature values of a column and there are only a few of them, you can use
 `sparse_column_with_keys`. Each key in the list will get assigned an
 auto-incremental ID starting from 0. For example, for the `gender` column we can
@@ -361,7 +361,7 @@ in `model_dir`.
 ## Training and Evaluating Our Model
 
 After adding all the features to the model, now let's look at how to actually
-train the model. Training a model is just a one-liner using the tf.contrib.learn API:
+train the model. Training a model is just a one-liner using the TF.Learn API:
 
 ```python
 m.fit(input_fn=train_input_fn, steps=200)
@@ -467,4 +467,4 @@ value would be high.
 
 If you're interested in learning more, check out our @{$wide_and_deep$Wide & Deep Learning Tutorial} where we'll show you how to combine
 the strengths of linear models and deep neural networks by jointly training them
-using the tf.contrib.learn API.
+using the TF.Learn API.
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index 0978005d6c..77c905fd51 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -9,7 +9,7 @@ great for training deep neural networks too, and you might be thinking which one
 you should choose—Well, why not both? Would it be possible to combine the
 strengths of both in one model?
 
-In this tutorial, we'll introduce how to use the tf.contrib.learn API to jointly train a
+In this tutorial, we'll introduce how to use the TF.Learn API to jointly train a
 wide linear model and a deep feed-forward neural network. This approach combines
 the strengths of memorization and generalization. It's useful for generic
 large-scale regression and classification problems with sparse input features
@@ -23,7 +23,7 @@ The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
 with an embedding layer and several hidden layers), and a Wide & Deep model
 (joint training of both). At a high level, there are only 3 steps to configure a
-wide, deep, or Wide & Deep model using the tf.contrib.learn API:
+wide, deep, or Wide & Deep model using the TF.Learn API:
 
 1.  Select features for the wide part: Choose the sparse base columns and
     crossed columns you want to use.
@@ -42,7 +42,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 8e7c19035e..dfb21334f8 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -351,7 +351,7 @@ to evaluate embeddings is to directly use them to predict syntactic and semantic
 relationships like `king is to queen as father is to ?`. This is called
 *analogical reasoning* and the task was introduced by
 [Mikolov and colleagues
-](http://www.anthology.aclweb.org/N/N13/N13-1090.pdf).
+](http://msr-waypoint.com/en-us/um/people/gzweig/Pubs/NAACL2013Regularities.pdf).
 Download the dataset for this task from
 [download.tensorflow.org](http://download.tensorflow.org/data/questions-words.txt).
 
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 44a3097d80..8e3b1a3a36 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -800,27 +800,11 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
   return evaluation_step, prediction
 
 
-def save_graph_to_file(sess, graph, graph_file_name):
-  output_graph_def = graph_util.convert_variables_to_constants(
-    sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-  with gfile.FastGFile(graph_file_name, 'wb') as f:
-    f.write(output_graph_def.SerializeToString())
-  return
-
-
-def prepare_file_system():
+def main(_):
   # Setup the directory we'll write summaries to for TensorBoard
   if tf.gfile.Exists(FLAGS.summaries_dir):
     tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
   tf.gfile.MakeDirs(FLAGS.summaries_dir)
-  if FLAGS.intermediate_store_frequency > 0:
-    ensure_dir_exists(FLAGS.intermediate_output_graphs_dir)
-  return
-
-
-def main(_):
-  # Prepare necessary directories  that can be used during training
-  prepare_file_system()
 
   # Set up the pre-trained graph.
   maybe_download_and_extract()
@@ -933,14 +917,6 @@ def main(_):
               (datetime.now(), i, validation_accuracy * 100,
                len(validation_bottlenecks)))
 
-      # Store intermediate results
-      intermediate_frequency = FLAGS.intermediate_store_frequency
-
-      if intermediate_frequency > 0 and (i % intermediate_frequency == 0) and i > 0:
-        intermediate_file_name = FLAGS.intermediate_output_graphs_dir + 'intermediate_' + str(i) + '.pb'
-        print('Save intermediate result to : ' + intermediate_file_name)
-        save_graph_to_file(sess, graph, intermediate_file_name)
-                
     # We've completed all our training, so run a final test evaluation on
     # some new images we haven't used before.
     test_bottlenecks, test_ground_truth, test_filenames = (
@@ -964,7 +940,10 @@ def main(_):
 
     # Write out the trained graph and labels with the weights stored as
     # constants.
-    save_graph_to_file(sess, graph, FLAGS.output_graph)
+    output_graph_def = graph_util.convert_variables_to_constants(
+        sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
+    with gfile.FastGFile(FLAGS.output_graph, 'wb') as f:
+      f.write(output_graph_def.SerializeToString())
     with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
       f.write('\n'.join(image_lists.keys()) + '\n')
 
@@ -983,18 +962,6 @@ if __name__ == '__main__':
       default='/tmp/output_graph.pb',
       help='Where to save the trained graph.'
   )
-  parser.add_argument(
-      '--intermediate_output_graphs_dir',
-      type=str,
-      default='/tmp/intermediate_graph/',
-      help='Where to save the intermediate graphs.'
-  )
-  parser.add_argument(
-      '--intermediate_store_frequency',
-      type=int,
-      default=0,
-      help='How many steps to store intermediate graph. If "0" then will not store.'
-  )
   parser.add_argument(
       '--output_labels',
       type=str,
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index 4c5893384a..77b245ab15 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -32,7 +32,7 @@ TFLEARN_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn
 
 
 function test() {
-  echo "Test $1:"
+  echo "Test "$1":"
   $TFLEARN_EXAMPLE_BASE_DIR/$1 $2
   if [ $? -eq 0 ]
   then
diff --git a/tensorflow/examples/tutorials/mnist/mnist.py b/tensorflow/examples/tutorials/mnist/mnist.py
index 3585043a2a..d533697976 100644
--- a/tensorflow/examples/tutorials/mnist/mnist.py
+++ b/tensorflow/examples/tutorials/mnist/mnist.py
@@ -17,7 +17,7 @@
 
 Implements the inference/loss/training pattern for model building.
 
-1. inference() - Builds the model as far as required for running the network
+1. inference() - Builds the model as far as is required for running the network
 forward to make predictions.
 2. loss() - Adds to the inference model the layers required to generate loss.
 3. training() - Adds to the loss model the Ops required to generate and
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index aee482fda5..13e5717b0d 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -91,6 +91,7 @@ print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
 
 data_index = 0
 
+
 # Step 3: Function to generate a training batch for the skip-gram model.
 def generate_batch(batch_size, num_skips, skip_window):
   global data_index
@@ -100,10 +101,9 @@ def generate_batch(batch_size, num_skips, skip_window):
   labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
   span = 2 * skip_window + 1  # [ skip_window target skip_window ]
   buffer = collections.deque(maxlen=span)
-  if data_index + span > len(data):
-    data_index = 0
-  buffer.extend(data[data_index:data_index + span])
-  data_index += span
+  for _ in range(span):
+    buffer.append(data[data_index])
+    data_index = (data_index + 1) % len(data)
   for i in range(batch_size // num_skips):
     target = skip_window  # target label at the center of the buffer
     targets_to_avoid = [skip_window]
@@ -113,12 +113,8 @@ def generate_batch(batch_size, num_skips, skip_window):
       targets_to_avoid.append(target)
       batch[i * num_skips + j] = buffer[skip_window]
       labels[i * num_skips + j, 0] = buffer[target]
-    if data_index == len(data):
-      buffer[:] = data[:span]
-      data_index = span
-    else:
-      buffer.append(data[data_index])
-      data_index += 1
+    buffer.append(data[data_index])
+    data_index = (data_index + 1) % len(data)
   # Backtrack a little bit to avoid skipping words in the end of a batch
   data_index = (data_index + len(data) - span) % len(data)
   return batch, labels
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 9c2fa60017..a1b4255292 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -9,6 +9,35 @@ Construct and execute TensorFlow graphs in Go.
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
+1.  Download and extract the TensorFlow C library, preferably into `/usr/local`.
+    GPU-enabled versions require CUDA 8.0 and cuDNN 5.1. For other versions, the
+    TensorFlow C library will have to be built from source (see below).
+
+    -   Linux:
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.1.0.tar.gz),
+        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.1.0.tar.gz)
+    -   OS X
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.1.0.tar.gz),
+
+    The following shell snippet downloads and extracts into `/usr/local`:
+
+    ```sh
+    TF_TYPE="cpu" # Set to "gpu" for GPU support
+    curl -L \
+      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.1.0.tar.gz" |
+    sudo tar -C /usr/local -xz
+    ```
+
+2.  `go get` this package (and run tests):
+
+    ```sh
+    go get github.com/tensorflow/tensorflow/tensorflow/go
+    go test github.com/tensorflow/tensorflow/tensorflow/go
+    ```
+
+3.  Done!
+
+### Installing into locations other than `/usr/local`
 
 Refer to [Installing TensorFlow for Go](https://www.tensorflow.org/install/install_go)
 
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index 15ef3b9525..d791e39c40 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -20,7 +20,7 @@ go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go
 
 cd $(dirname $0)
-for g in $(echo "${GOPATH//:/ }"); do
+for g in $(echo $GOPATH | sed "s/:/ /g"); do
     TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
     PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
     if [ -x "${PROTOC}" ]; then
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 6675f1097e..8dd2931703 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -14288,36 +14288,6 @@ func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cosh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes hyperbolic tangent of `x` element-wise.
 func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
diff --git a/tensorflow/go/shape.go b/tensorflow/go/shape.go
index 8d000cb9de..114ab5decb 100644
--- a/tensorflow/go/shape.go
+++ b/tensorflow/go/shape.go
@@ -59,7 +59,7 @@ func (s Shape) NumDimensions() int {
 //
 // REQUIRES: 0 <= dim < s.NumDimensions()
 func (s Shape) Size(dim int) int64 {
-	if dim < 0 || dim >= s.NumDimensions() {
+	if dim < 0 || dim > s.NumDimensions() {
 		return -1
 	}
 	return s.dims[dim]
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 4a60c736b5..34e797a2b3 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -270,7 +270,7 @@ func typeOf(dt DataType, shape []int64) reflect.Type {
 	if ret == nil {
 		panic(bug("DataType %v is not supported", dt))
 	}
-	for range shape {
+	for _ = range shape {
 		ret = reflect.SliceOf(ret)
 	}
 	return ret
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 90372660cd..9abb63c966 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -9,10 +9,7 @@ load("build_defs", "JAVACOPTS")
 
 java_library(
     name = "tensorflow",
-    srcs = [
-        ":java_op_sources",
-        ":java_sources",
-    ],
+    srcs = [":java_sources"],
     data = [":libtensorflow_jni"],
     javacopts = JAVACOPTS,
     visibility = ["//visibility:public"],
@@ -29,14 +26,6 @@ filegroup(
     ],
 )
 
-filegroup(
-    name = "java_op_sources",
-    srcs = glob(["src/main/java/org/tensorflow/op/*.java"]),
-    visibility = [
-        "//tensorflow/java:__pkg__",
-    ],
-)
-
 java_library(
     name = "testutil",
     testonly = 1,
@@ -149,19 +138,6 @@ java_test(
     ],
 )
 
-java_test(
-    name = "ScopeTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/op/ScopeTest.java"],
-    javacopts = JAVACOPTS,
-    test_class = "org.tensorflow.op.ScopeTest",
-    deps = [
-        ":tensorflow",
-        ":testutil",
-        "@junit",
-    ],
-)
-
 filegroup(
     name = "libtensorflow_jni",
     srcs = select({
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index 5bd5b9a388..750d76301e 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -1,14 +1,5 @@
 # -*- Python -*-
 
-# Pin to Java 1.7 to ensure broader compatibility for the Java bindings on
-# Android. Note also that the android_library bazel rule currently enforces
-# java 7
-# https://github.com/bazelbuild/bazel/blob/6c1106b1a721516d3b3db54d2e1c31b44a76fbb1/src/main/java/com/google/devtools/build/lib/bazel/rules/android/BazelAndroidSemantics.java#L73
-
-JAVA_VERSION_OPTS = [
-    "-source 7 -target 7",
-]
-
 # A more robust set of lint and errorprone checks when building
 # Java source to improve code consistency.
 
@@ -160,4 +151,4 @@ EP_DISABLED_CHECKS = [
 
 EP_OPTS = EP_ENABLED_WARNINGS + EP_DISABLED_CHECKS
 
-JAVACOPTS = JAVA_VERSION_OPTS + XLINT_OPTS + EP_OPTS
+JAVACOPTS = XLINT_OPTS + EP_OPTS
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 462e38d332..1fc9d49a86 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -19,8 +19,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.7</source>
-          <target>1.7</target>
+          <source>1.8</source>
+          <target>1.8</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index f2974220d9..d1d0359b1d 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -16,7 +16,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>3.3.1</version>
+      <version>3.2.0</version>
     </dependency>
   </dependencies>
 
@@ -27,8 +27,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.7</source>
-          <target>1.7</target>
+          <source>1.8</source>
+          <target>1.8</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh
index 0bb4efbcc0..0e14bc4964 100755
--- a/tensorflow/java/maven/release.sh
+++ b/tensorflow/java/maven/release.sh
@@ -21,7 +21,7 @@ TF_VERSION="$1"
 SETTINGS_XML="$2"
 shift
 shift
-CMD="$*"
+CMD="$@"
 
 if [[ -z "${TF_VERSION}" ]]
 then
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index 302ed96c12..b5e2bfc3a6 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -23,7 +23,7 @@ IS_SNAPSHOT="false"
 if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then
   IS_SNAPSHOT="true"
 fi
-PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip"
+PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.2.0/protoc-3.2.0-linux-x86_64.zip"
 
 set -ex
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
deleted file mode 100644
index dff3a45463..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/Input.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow;
-
-/**
- * Interface implemented by operands of a TensorFlow operation.
- *
- * <p>Example usage:
- *
- * <pre>{@code
- * // The "decodeJpeg" operation can be used as input to the "cast" operation
- * Input decodeJpeg = ops.image().decodeJpeg(...);
- * ops.math().cast(decodeJpeg, DataType.FLOAT);
- *
- * // The output "y" of the "unique" operation can be used as input to the "cast" operation
- * Output y = ops.array().unique(...).y();
- * ops.math().cast(y, DataType.FLOAT);
- *
- * // The "split" operation can be used as input list to the "concat" operation
- * Iterable<? extends Input> split = ops.array().split(...);
- * ops.array().concat(0, split);
- * }</pre>
- */
-public interface Input {
-
-  /**
-   * Returns the symbolic handle of a tensor.
-   *
-   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
-   * used to obtain a symbolic handle that represents the computation of the input.
-   *
-   * @see {@link OperationBuilder#addInput(Output)}.
-   */
-  Output asOutput();
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
index e7de603409..43dbaf125c 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
@@ -79,7 +79,7 @@ public final class Operation {
    *
    * @param name identifier of the list of tensors (of which there may
    *        be many) produced by this operation.
-   * @return the size of the list of Tensors produced by this named output.
+   * @returns the size of the list of Tensors produced by this named output.
    * @throws IllegalArgumentException if this operation has no output
    *         with the provided name.
    */
@@ -97,28 +97,6 @@ public final class Operation {
     return new Output(this, idx);
   }
 
-  /**
-   * Returns the size of the given inputs list of Tensors for this operation.
-   *
-   * <p>An Operation has multiple named inputs, each of which contains either
-   * a single tensor or a list of tensors. This method returns the size of
-   * the list of tensors for a specific named input of the operation.
-   *
-   * @param name identifier of the list of tensors (of which there may
-   *        be many) inputs to this operation.
-   * @returns the size of the list of Tensors produced by this named input.
-   * @throws IllegalArgumentException if this operation has no input
-   *         with the provided name.
-   */
-  public int inputListLength(final String name) {
-    Graph.Reference r = graph.ref();
-    try {
-      return inputListLength(unsafeNativeHandle, name);
-    } finally {
-      r.close();
-    }
-  }
-
   long getUnsafeNativeHandle() {
     return unsafeNativeHandle;
   }
@@ -154,8 +132,6 @@ public final class Operation {
 
   private static native int outputListLength(long handle, String name);
 
-  private static native int inputListLength(long handle, String name);
-
   private static native long[] shape(long graphHandle, long opHandle, int output);
 
   private static native int dtype(long graphHandle, long opHandle, int output);
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index 8f7559d39e..38ffa2a8e1 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -28,7 +28,7 @@ import java.nio.charset.Charset;
  * <pre>{@code
  * // g is a Graph instance.
  * try (Tensor c1 = Tensor.create(3.0f)) {
- *   g.opBuilder("Const", "MyConst")
+ *   g.opBuilder("Constant", "MyConst")
  *       .setAttr("dtype", c1.dataType())
  *       .setAttr("value", c1)
  *       .build();
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Output.java b/tensorflow/java/src/main/java/org/tensorflow/Output.java
index 2e3f8d4eac..ab128c2b30 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Output.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Output.java
@@ -20,11 +20,8 @@ package org.tensorflow;
  *
  * <p>An Output is a symbolic handle to a tensor. The value of the Tensor is computed by executing
  * the {@link Operation} in a {@link Session}.
- *
- * <p>By implementing the {@link Input} interface, instances of this class could also be passed
- * directly in input to an operation.
  */
-public final class Output implements Input {
+public final class Output {
 
   /** Handle to the idx-th output of the Operation {@code op}. */
   public Output(Operation op, int idx) {
@@ -52,11 +49,6 @@ public final class Output implements Input {
     return operation.dtype(index);
   }
 
-  @Override
-  public Output asOutput() {
-    return this;
-  }
-
   private final Operation operation;
   private final int index;
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index f73cded4e3..0d071e1674 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -125,7 +125,7 @@ public final class Session implements AutoCloseable {
      *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
      *     feed(operation_name, output_index)}. These colon-separated names are commonly used in the
      *     {@code SignatureDef} protocol buffer messages that are included in {@link
-     *     SavedModelBundle#metaGraphDef()}.
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner feed(String operation, Tensor t) {
       return feed(parseOutput(operation), t);
@@ -165,7 +165,7 @@ public final class Session implements AutoCloseable {
      *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
      *     fetch(operation_name, output_index)}. These colon-separated names are commonly used in
      *     the {@code SignatureDef} protocol buffer messages that are included in {@link
-     *     SavedModelBundle#metaGraphDef()}.
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner fetch(String operation) {
       return fetch(parseOutput(operation));
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java b/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
deleted file mode 100644
index 2e84cac1ac..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.op;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * A class to manage scoped (hierarchical) names for operators.
- *
- * <p>{@code NameScope} manages hierarchical names where each component in the hierarchy is
- * separated by a forward slash {@code '/'}. For instance, {@code nn/Const_72} or {@code
- * nn/gradient/assign/init}. Each scope is a subtree in this hierarchy.
- *
- * <p>Use {@code NameScope} to group related operations within a hierarchy, which for example lets
- * tensorboard coalesce nodes for better graph visualizations.
- *
- * <p>This class is package private, user code creates {@link Scope} which internally delegates
- * calls to an underlying {@code NameScope}.
- *
- * <p>This class is <b>not</b> thread-safe.
- */
-final class NameScope {
-
-  NameScope withSubScope(String scopeName) {
-    checkPattern(NAME_REGEX, scopeName);
-    // Override with opName if it exists.
-    String actualName = (opName != null) ? opName : scopeName;
-    String newPrefix = fullyQualify(makeUnique(actualName));
-    return new NameScope(newPrefix, null, null);
-  }
-
-  NameScope withName(String name) {
-    checkPattern(NAME_REGEX, name);
-    // All context except for the opName is shared with the new scope.
-    return new NameScope(opPrefix, name, ids);
-  }
-
-  String makeOpName(String name) {
-    checkPattern(NAME_REGEX, name);
-    // Override with opName if it exists.
-    String actualName = (opName != null) ? opName : name;
-    return fullyQualify(makeUnique(actualName));
-  }
-
-  /**
-   * Create a new, root-level namescope.
-   *
-   * <p>A root-level namescope generates operator names with no components, like {@code Const_72}
-   * and {@code result}.
-   */
-  NameScope() {
-    this(null, null, null);
-  }
-
-  private NameScope(String opPrefix, String opName, Map<String, Integer> ids) {
-    this.opPrefix = opPrefix;
-    this.opName = opName;
-    if (ids != null) {
-      this.ids = ids;
-    } else {
-      this.ids = new HashMap<String, Integer>();
-    }
-  }
-
-  // Generate a unique name, different from existing ids.
-  //
-  // ids is a map from id to integer, representing a counter of the
-  // number of previous requests to generate a unique name for the
-  // given id.
-  //
-  // For instance, the first use of makeUnique("a") adds "a" -> 1
-  // to ids and returns "a".
-  //
-  // The second use of makeUnique("a") updates ids to "a" -> 2
-  // and returns "a_1", and so on.
-  private String makeUnique(String id) {
-    if (!ids.containsKey(id)) {
-      ids.put(id, 1);
-      return id;
-    } else {
-      int cur = ids.get(id);
-      ids.put(id, cur + 1);
-      return String.format("%s_%d", id, cur);
-    }
-  }
-
-  private String fullyQualify(String name) {
-    if (opPrefix != null) {
-      return String.format("%s/%s", opPrefix, name);
-    } else {
-      return name;
-    }
-  }
-
-  // If opPrefix is non-null, it is a prefix applied to all names
-  // created by this instance.
-  private final String opPrefix;
-
-  // If opName is non-null, it is used to derive the unique name
-  // for operators rather than the provided default name.
-  private final String opName;
-
-  // NameScope generates unique names by appending a numeric suffix if
-  // needed. This is a map containing names already created by this
-  // instance mapped to the next available numeric suffix for it.
-  private final Map<String, Integer> ids;
-
-  private static void checkPattern(Pattern pattern, String name) {
-    if (name == null) {
-      throw new IllegalArgumentException("Names cannot be null");
-    }
-    if (!pattern.matcher(name).matches()) {
-      throw new IllegalArgumentException(
-          String.format(
-              "invalid name: '%s' does not match the regular expression %s",
-              name, NAME_REGEX.pattern()));
-    }
-  }
-
-  // The constraints for operator and scope names originate from restrictions on node names
-  // noted in the proto definition core/framework/node_def.proto for NodeDef and actually
-  // implemented in core/framework/node_def_util.cc [Note that the proto comment does not include
-  // dash (-) in names, while the actual implementation permits it. This regex follows the actual
-  // implementation.]
-  //
-  // This pattern is used to ensure fully qualified names always start with a LETTER_DIGIT_DOT,
-  // followed by zero or more LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE. SLASH is not permitted in
-  // actual user-supplied names to NameScope - it is used as a reserved character to separate
-  // subcomponents within fully qualified names.
-  private static final Pattern NAME_REGEX = Pattern.compile("[A-Za-z0-9.][A-Za-z0-9_.\\-]*");
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java b/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
deleted file mode 100644
index 8de2eaeb79..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.op;
-
-import org.tensorflow.Graph;
-
-/**
- * Manages groups of related properties when creating Tensorflow Operations, such as a common name
- * prefix.
- *
- * <p>A {@code Scope} is a container for common properties applied to TensorFlow Ops. Normal user
- * code initializes a {@code Scope} and provides it to Operation building classes. For example:
- *
- * <pre>{@code
- * Scope scope = new Scope(graph);
- * Constant c = Constant.create(scope, 42);
- * }</pre>
- *
- * <p>An Operation building class acquires a Scope, and uses it to set properties on the underlying
- * Tensorflow ops. For example:
- *
- * <pre>{@code
- * // An operator class that adds a constant.
- * public class Constant {
- *   public static Constant create(Scope scope, ...) {
- *      scope.graph().opBuilder(
- *        "Const", scope.makeOpName("Const"))
- *        .setAttr(...)
- *        .build()
- *      ...
- *   }
- * }
- * }</pre>
- *
- * <p><b>Scope hierarchy:</b>
- *
- * <p>A {@code Scope} provides various {@code with()} methods that create a new scope. The new scope
- * typically has one property changed while other properties are inherited from the parent scope.
- *
- * <p>An example using {@code Constant} implemented as before:
- *
- * <pre>{@code
- * Scope root = new Scope(graph);
- *
- * // The linear subscope will generate names like linear/...
- * Scope linear = Scope.withSubScope("linear");
- *
- * // This op name will be "linear/W"
- * Constant.create(linear.withName("W"), ...);
- *
- * // This op will be "linear/Const", using the default
- * // name provided by Constant
- * Constant.create(linear, ...);
- *
- * // This op will be "linear/Const_1", using the default
- * // name provided by Constant and making it unique within
- * // this scope
- * Constant.create(linear, ...);
- * }</pre>
- *
- * <p>Scope objects are <b>not</b> thread-safe.
- */
-public final class Scope {
-
-  /**
-   * Create a new top-level scope.
-   *
-   * @param graph The graph instance to be managed by the scope.
-   */
-  public Scope(Graph graph) {
-    this(graph, new NameScope());
-  }
-
-  /** Returns the graph managed by this scope. */
-  public Graph graph() {
-    return graph;
-  }
-
-  /**
-   * Returns a new scope where added operations will have the provided name prefix.
-   *
-   * <p>Ops created with this scope will have {@code name/childScopeName/} as the prefix. The actual
-   * name will be unique in the returned scope. All other properties are inherited from the current
-   * scope.
-   *
-   * <p>The child scope name must match the regular expression {@code [A-Za-z0-9.][A-Za-z0-9_.\-]*}
-   *
-   * @param childScopeName name for the new child scope
-   * @return a new subscope
-   * @throws IllegalArgumentException if the name is invalid
-   */
-  public Scope withSubScope(String childScopeName) {
-    return new Scope(graph, nameScope.withSubScope(childScopeName));
-  }
-
-  /**
-   * Return a new scope that uses the provided name for an op.
-   *
-   * <p>Operations created within this scope will have a name of the form {@code
-   * name/opName[_suffix]}. This lets you name a specific operator more meaningfully.
-   *
-   * <p>Names must match the regular expression {@code [A-Za-z0-9.][A-Za-z0-9_.\-]*}
-   *
-   * @param opName name for an operator in the returned scope
-   * @return a new Scope that uses opName for operations.
-   * @throws IllegalArgumentException if the name is invalid
-   */
-  public Scope withName(String opName) {
-    return new Scope(graph, nameScope.withName(opName));
-  }
-
-  /**
-   * Create a unique name for an operator, using a provided default if necessary.
-   *
-   * <p>This is normally called only by operator building classes.
-   *
-   * <p>This method generates a unique name, appropriate for the name scope controlled by this
-   * instance. Typical operator building code might look like
-   *
-   * <pre>{@code
-   * scope.graph().opBuilder("Const", scope.makeOpName("Const"))...
-   * }</pre>
-   *
-   * <p><b>Note:</b> if you provide a composite operator building class (i.e, a class that adds a
-   * set of related operations to the graph by calling other operator building code) you should also
-   * create a {@link #withSubScope(String)} scope for the underlying operators to group them under a
-   * meaningful name.
-   *
-   * <pre>{@code
-   * public static Stddev create(Scope scope, ...) {
-   *   // group sub-operations under a common name
-   *   Scope group = scope.withSubScope("stddev");
-   *   ... Sqrt.create(group, Mean.create(group, ...))
-   * }
-   * }</pre>
-   *
-   * @param defaultName name for the underlying operator.
-   * @return unique name for the operator.
-   * @throws IllegalArgumentException if the default name is invalid.
-   */
-  public String makeOpName(String defaultName) {
-    return nameScope.makeOpName(defaultName);
-  }
-
-  private Scope(Graph graph, NameScope nameScope) {
-    this.graph = graph;
-    this.nameScope = nameScope;
-  }
-
-  private final Graph graph;
-  private final NameScope nameScope;
-}
diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc
index ccc44d91c0..b3d5fc4ec3 100644
--- a/tensorflow/java/src/main/native/operation_jni.cc
+++ b/tensorflow/java/src/main/native/operation_jni.cc
@@ -156,21 +156,3 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv* env,
 
   return static_cast<jint>(TF_OperationOutputType(TF_Output{op, output_index}));
 }
-
-JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv* env,
-                                                                      jclass clazz,
-                                                                      jlong handle,
-                                                                      jstring name) {
-  TF_Operation* op = requireHandle(env, handle);
-  if (op == nullptr) return 0;
-
-  TF_Status* status = TF_NewStatus();
-
-  const char* cname = env->GetStringUTFChars(name, nullptr);
-  int result = TF_OperationInputListLength(op, cname, status);
-  env->ReleaseStringUTFChars(name, cname);
-
-  throwExceptionIfNotOK(env, status);
-  TF_DeleteStatus(status);
-  return result;
-}
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index 6f379256d2..b5d156f7c2 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -73,17 +73,6 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Operation_shape(JNIEnv *,
 JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv *, jclass,
                                                            jlong, jlong, jint);
 
-
-/*
- * Class:     org_tensorflow_Operation
- * Method:    inputListLength
- * Signature: (JLjava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv *,
-                                                                      jclass,
-                                                                      jlong,
-                                                                      jstring);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
index 4fa68130c0..74fdcf484e 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
@@ -52,16 +52,6 @@ public class OperationTest {
     assertEquals(3, split(new int[] {0, 1, 2}, 3));
   }
 
-  @Test
-  public void inputListLength() {
-    assertEquals(1, splitWithInputList(new int[] {0, 1}, 1, "split_dim"));
-    try {
-      splitWithInputList(new int[] {0, 1}, 2, "inputs");
-    } catch (IllegalArgumentException iae) {
-      // expected
-    }
-  }
-
   private static int split(int[] values, int num_split) {
     try (Graph g = new Graph()) {
       return g.opBuilder("Split", "Split")
@@ -72,15 +62,4 @@ public class OperationTest {
           .outputListLength("output");
     }
   }
-
-  private static int splitWithInputList(int[] values, int num_split, String name) {
-    try (Graph g = new Graph()) {
-      return g.opBuilder("Split", "Split")
-          .addInput(TestUtil.constant(g, "split_dim", 0))
-          .addInput(TestUtil.constant(g, "values", values))
-          .setAttr("num_split", num_split)
-          .build()
-          .inputListLength(name);
-    }
-  }
 }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index 50bdf351e3..0d2dbc5b88 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -109,7 +109,7 @@ public class SessionTest {
         assertEquals(1, outputs.size());
         final int[][] expected = {{31}};
         assertArrayEquals(expected, outputs.get(0).copyTo(new int[1][1]));
-        // Sanity check on metadata
+        // Sanity check on metadata
         // See comments in fullTraceRunOptions() for an explanation about
         // why this check is really silly. Ideally, this would be:
         /*
@@ -187,7 +187,7 @@ public class SessionTest {
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251515362
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251692558
     //
-    // For this test, for now, the use of specific bytes suffices.
+    // For this test, for now, the use of specific bytes suffices.
     return new byte[] {0x08, 0x03};
     /*
     return org.tensorflow.framework.RunOptions.newBuilder()
@@ -207,7 +207,7 @@ public class SessionTest {
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251515362
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251692558
     //
-    // For this test, for now, the use of specific bytes suffices.
+    // For this test, for now, the use of specific bytes suffices.
     return new byte[] {0x10, 0x01, 0x28, 0x01};
     /*
     return org.tensorflow.framework.ConfigProto.newBuilder()
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 3ff59e71b2..44eecc1d1e 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -472,7 +472,7 @@ public class TensorTest {
   @Test
   public void fromHandle() {
     // fromHandle is a package-visible method intended for use when the C TF_Tensor object has been
-    // created independently of the Java code. In practice, two Tensor instances MUST NOT have the
+    // created independently of the Java code. In practice, two Tensor instances MUST NOT have the
     // same native handle.
     //
     // An exception is made for this test, where the pitfalls of this is avoided by not calling
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java
deleted file mode 100644
index 9256cb281d..0000000000
--- a/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java
+++ /dev/null
@@ -1,270 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.op;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.fail;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.tensorflow.Graph;
-import org.tensorflow.Output;
-import org.tensorflow.Session;
-import org.tensorflow.Tensor;
-
-/** Unit tests for {@link org.tensorflow.Scope}. */
-@RunWith(JUnit4.class)
-public class ScopeTest {
-
-  @Test
-  public void basicNames() {
-    try (Graph g = new Graph()) {
-      Scope root = new Scope(g);
-      assertEquals("add", root.makeOpName("add"));
-      assertEquals("add_1", root.makeOpName("add"));
-      assertEquals("add_2", root.makeOpName("add"));
-      assertEquals("mul", root.makeOpName("mul"));
-    }
-  }
-
-  @Test
-  public void hierarchicalNames() {
-    try (Graph g = new Graph()) {
-      Scope root = new Scope(g);
-      Scope child = root.withSubScope("child");
-      assertEquals("child/add", child.makeOpName("add"));
-      assertEquals("child/add_1", child.makeOpName("add"));
-      assertEquals("child/mul", child.makeOpName("mul"));
-
-      Scope child_1 = root.withSubScope("child");
-      assertEquals("child_1/add", child_1.makeOpName("add"));
-      assertEquals("child_1/add_1", child_1.makeOpName("add"));
-      assertEquals("child_1/mul", child_1.makeOpName("mul"));
-
-      Scope c_c = root.withSubScope("c").withSubScope("c");
-      assertEquals("c/c/add", c_c.makeOpName("add"));
-
-      Scope c_1 = root.withSubScope("c");
-      Scope c_1_c = c_1.withSubScope("c");
-      assertEquals("c_1/c/add", c_1_c.makeOpName("add"));
-
-      Scope c_1_c_1 = c_1.withSubScope("c");
-      assertEquals("c_1/c_1/add", c_1_c_1.makeOpName("add"));
-    }
-  }
-
-  @Test
-  public void scopeAndOpNames() {
-    try (Graph g = new Graph()) {
-      Scope root = new Scope(g);
-
-      Scope child = root.withSubScope("child");
-
-      assertEquals("child/add", child.makeOpName("add"));
-      assertEquals("child_1", root.makeOpName("child"));
-      assertEquals("child_2/p", root.withSubScope("child").makeOpName("p"));
-    }
-  }
-
-  @Test
-  public void validateNames() {
-    try (Graph g = new Graph()) {
-      Scope root = new Scope(g);
-
-      final String[] invalid_names = {
-        "_", "-", "-x", // Names are constrained to start with [A-Za-z0-9.]
-        null, "", "a$", // Invalid characters
-        "a/b", // slashes not allowed
-      };
-
-      for (String name : invalid_names) {
-        try {
-          root.withName(name);
-          fail("failed to catch invalid op name.");
-        } catch (IllegalArgumentException ex) {
-          // expected
-        }
-        // Subscopes follow the same rules
-        try {
-          root.withSubScope(name);
-          fail("failed to catch invalid scope name: " + name);
-        } catch (IllegalArgumentException ex) {
-          // expected
-        }
-      }
-
-      // Unusual but valid names.
-      final String[] valid_names = {".", "..", "._-.", "a--."};
-
-      for (String name : valid_names) {
-        root.withName(name);
-        root.withSubScope(name);
-      }
-    }
-  }
-
-  @Test
-  public void basic() {
-    try (Graph g = new Graph()) {
-      Scope s = new Scope(g);
-      Const c1 = Const.create(s, 42);
-      assertEquals("Const", c1.output().op().name());
-      Const c2 = Const.create(s, 7);
-      assertEquals("Const_1", c2.output().op().name());
-      Const c3 = Const.create(s.withName("four"), 4);
-      assertEquals("four", c3.output().op().name());
-      Const c4 = Const.create(s.withName("four"), 4);
-      assertEquals("four_1", c4.output().op().name());
-    }
-  }
-
-  @Test
-  public void hierarchy() {
-    try (Graph g = new Graph()) {
-      Scope root = new Scope(g);
-      Scope child = root.withSubScope("child");
-      assertEquals("child/Const", Const.create(child, 42).output().op().name());
-      assertEquals("child/four", Const.create(child.withName("four"), 4).output().op().name());
-    }
-  }
-
-  @Test
-  public void composite() {
-    try (Graph g = new Graph();
-        Session sess = new Session(g)) {
-      Scope s = new Scope(g);
-      Output data = Const.create(s.withName("data"), new int[] {600, 470, 170, 430, 300}).output();
-
-      // Create a composite op with a customized name
-      Variance var1 = Variance.create(s.withName("example"), data);
-      assertEquals("example/variance", var1.output().op().name());
-
-      // Confirm internally added ops have the right names.
-      assertNotNull(g.operation("example/squared_deviation"));
-      assertNotNull(g.operation("example/Mean"));
-      assertNotNull(g.operation("example/zero"));
-
-      // Same composite op with a default name
-      Variance var2 = Variance.create(s, data);
-      assertEquals("variance/variance", var2.output().op().name());
-
-      // Confirm internally added ops have the right names.
-      assertNotNull(g.operation("variance/squared_deviation"));
-      assertNotNull(g.operation("variance/Mean"));
-      assertNotNull(g.operation("variance/zero"));
-
-      // Verify correct results as well.
-      Tensor result = sess.runner().fetch(var1.output()).run().get(0);
-      assertEquals(21704, result.intValue());
-      result = sess.runner().fetch(var2.output()).run().get(0);
-      assertEquals(21704, result.intValue());
-    }
-  }
-
-  // "handwritten" sample operator classes
-  private static final class Const {
-    private final Output output;
-
-    static Const create(Scope s, Object v) {
-      try (Tensor value = Tensor.create(v)) {
-        return new Const(
-            s.graph()
-                .opBuilder("Const", s.makeOpName("Const"))
-                .setAttr("dtype", value.dataType())
-                .setAttr("value", value)
-                .build()
-                .output(0));
-      }
-    }
-
-    Const(Output o) {
-      output = o;
-    }
-
-    Output output() {
-      return output;
-    }
-  }
-
-  private static final class Mean {
-    private final Output output;
-
-    static Mean create(Scope s, Output input, Output reductionIndices) {
-      return new Mean(
-          s.graph()
-              .opBuilder("Mean", s.makeOpName("Mean"))
-              .addInput(input)
-              .addInput(reductionIndices)
-              .build()
-              .output(0));
-    }
-
-    Mean(Output o) {
-      output = o;
-    }
-
-    Output output() {
-      return output;
-    }
-  }
-
-  private static final class SquaredDifference {
-    private final Output output;
-
-    static SquaredDifference create(Scope s, Output x, Output y) {
-      return new SquaredDifference(
-          s.graph()
-              .opBuilder("SquaredDifference", s.makeOpName("SquaredDifference"))
-              .addInput(x)
-              .addInput(y)
-              .build()
-              .output(0));
-    }
-
-    SquaredDifference(Output o) {
-      output = o;
-    }
-
-    Output output() {
-      return output;
-    }
-  }
-
-  private static final class Variance {
-    private final Output output;
-
-    static Variance create(Scope base, Output x) {
-      Scope s = base.withSubScope("variance");
-      Output zero = Const.create(s.withName("zero"), new int[] {0}).output();
-      Output sqdiff =
-          SquaredDifference.create(
-                  s.withName("squared_deviation"), x, Mean.create(s, x, zero).output())
-              .output();
-
-      return new Variance(Mean.create(s.withName("variance"), sqdiff, zero).output());
-    }
-
-    Variance(Output o) {
-      output = o;
-    }
-
-    Output output() {
-      return output;
-    }
-  }
-}
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 22b18b9cde..b65a638620 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -833,7 +833,6 @@ py_test(
     data = ["//tensorflow/python:meta_graph_testdata"],
     main = "framework/meta_graph_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -3056,7 +3055,7 @@ py_test(
     srcs = ["client/session_clusterspec_prop_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_pip_gpu",
+        "no_gpu",
     ],
     deps = [
         ":array_ops",
@@ -3080,7 +3079,7 @@ py_test(
     srcs = ["client/session_list_devices_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_pip_gpu",
+        "no_gpu",
     ],
     deps = [
         ":client",
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index e863c2ddb9..da27f4cebe 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -1312,7 +1312,7 @@ class DebugAnalyzer(object):
     all_inputs = copy.copy(tracker(node_name, is_control=False))
     is_ctrl = [False] * len(all_inputs)
     if include_control:
-      # Sort control inputs or recipients in alphabetical order of the node
+      # Sort control inputs or recipients in alphabetical order of the node
       # names.
       ctrl_inputs = sorted(tracker(node_name, is_control=True))
       all_inputs.extend(ctrl_inputs)
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 2b2289d6a8..536365b692 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -215,7 +215,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
       wrapper_sess.partial_run_setup(self._p)
 
   def testInteractiveSessionInit(self):
-    """The wrapper should work also on other subclasses of session.Session."""
+    """The wrapper should work also on other subclasses of session.Session."""
 
     TestDebugWrapperSession(
         session.InteractiveSession(), self._dump_root, self._observer)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 03bcf7ae57..8785ddc5a6 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -395,7 +395,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
 
 
 class DNNLinearCombinedRegressor(estimator.Estimator):
-  """An estimator for TensorFlow Linear and DNN joined models for regression.
+  """An estimator for TensorFlow Linear and DNN joined models for regression.
 
   Note: This estimator is also known as wide-n-deep.
 
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 552b1bdf01..05b1e5b44a 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -42,8 +42,8 @@ def _get_default_optimizer(feature_columns):
   return ftrl.FtrlOptimizer(learning_rate=learning_rate)
 
 
-# TODO(b/36813849): Revisit passing params vs named arguments.
-def _linear_model_fn(features, labels, mode, params, config):
+def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
+                     partitioner, config):
   """A model_fn for linear models that use a gradient-based optimizer.
 
   Args:
@@ -51,13 +51,12 @@ def _linear_model_fn(features, labels, mode, params, config):
     labels: `Tensor` of shape `[batch_size, logits_dimension]`.
     mode: Defines whether this is training, evaluation or prediction.
       See `ModeKeys`.
-    params: A dict of hyperparameters.
-      The following hyperparameters are expected:
-      * head: A `Head` instance.
-      * feature_columns: An iterable containing all the feature columns used by
-          the model.
-      * optimizer: string, `Optimizer` object, or callable that defines the
-          optimizer to use for training. If `None`, will use a FTRL optimizer.
+    head: A `Head` instance.
+    feature_columns: An iterable containing all the feature columns used by
+      the model.
+    optimizer: string, `Optimizer` object, or callable that defines the
+      optimizer to use for training. If `None`, will use a FTRL optimizer.
+    partitioner: Partitioner for variables.
     config: `RunConfig` object to configure the runtime settings.
 
   Returns:
@@ -66,14 +65,12 @@ def _linear_model_fn(features, labels, mode, params, config):
   Raises:
     ValueError: If mode or params are invalid.
   """
-  head = params['head']
-  feature_columns = tuple(params['feature_columns'])
   optimizer = optimizers.get_optimizer_instance(
-      params.get('optimizer') or _get_default_optimizer(feature_columns),
+      optimizer or _get_default_optimizer(feature_columns),
       learning_rate=_LEARNING_RATE)
   num_ps_replicas = config.num_ps_replicas if config else 0
 
-  partitioner = params.get('partitioner') or (
+  partitioner = partitioner or (
       partitioned_variables.min_max_variable_partitioner(
           max_partitions=num_ps_replicas,
           min_slice_size=64 << 20))
@@ -210,16 +207,20 @@ class LinearClassifier(estimator.Estimator):
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
           label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _linear_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
     super(LinearClassifier, self).__init__(
-        model_fn=_linear_model_fn,
+        model_fn=_model_fn,
         model_dir=model_dir,
-        config=config,
-        params={
-            'head': head,
-            'feature_columns': feature_columns,
-            'optimizer': optimizer,
-            'partitioner': partitioner,
-        })
+        config=config)
 
 
 class LinearRegressor(estimator.Estimator):
@@ -298,21 +299,19 @@ class LinearRegressor(estimator.Estimator):
       config: `RunConfig` object to configure the runtime settings.
       partitioner: Optional. Partitioner for input layer.
     """
+    head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
+        label_dimension=label_dimension, weight_column=weight_column)
+    def _model_fn(features, labels, mode, config):
+      return _linear_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
     super(LinearRegressor, self).__init__(
-        model_fn=_linear_model_fn,
+        model_fn=_model_fn,
         model_dir=model_dir,
-        config=config,
-        params={
-            # pylint: disable=protected-access
-            'head':
-                head_lib._regression_head_with_mean_squared_error_loss(
-                    label_dimension=label_dimension,
-                    weight_column=weight_column),
-            # pylint: enable=protected-access
-            'feature_columns':
-                feature_columns,
-            'optimizer':
-                optimizer,
-            'partitioner':
-                partitioner,
-        })
+        config=config)
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index d8e069105f..257677ce49 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2382,6 +2382,11 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    tags = [
+        "manual",  # http://b/62429636
+        "noguitar",
+        "notap",
+    ],
 )
 
 cuda_py_test(
@@ -2396,6 +2401,11 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    tags = [
+        "manual",  # http://b/62429636
+        "noguitar",
+        "notap",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index b5b17ff80a..013aa1ba8a 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -113,7 +113,6 @@ class MathBuiltinUnaryTest(test.TestCase):
     self._compare(data, np.arctan, math_ops.atan, use_gpu)
     self._compare(data, np.ceil, math_ops.ceil, use_gpu)
     self._compare(data, np.cos, math_ops.cos, use_gpu)
-    self._compare(data, np.cosh, math_ops.cosh, use_gpu)
     self._compare(data, np.exp, math_ops.exp, use_gpu)
     self._compare(data, np.floor, math_ops.floor, use_gpu)
     self._compare(data, np.log, math_ops.log, use_gpu)
@@ -121,7 +120,6 @@ class MathBuiltinUnaryTest(test.TestCase):
     self._compare(data, np.negative, math_ops.negative, use_gpu)
     self._compare(data, self._rsqrt, math_ops.rsqrt, use_gpu)
     self._compare(data, np.sin, math_ops.sin, use_gpu)
-    self._compare(data, np.sinh, math_ops.sinh, use_gpu)
     self._compare(data, np.sqrt, math_ops.sqrt, use_gpu)
     self._compare(data, np.square, math_ops.square, use_gpu)
     self._compare(data, np.tan, math_ops.tan, use_gpu)
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index b47139e6b8..0846470abc 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -200,8 +200,6 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(z, np.log, math_ops.log)
     self._compareBoth(z, np.log1p, math_ops.log1p)
-    self._compareBoth(x, np.sinh, math_ops.sinh)
-    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(x, self._log_sigmoid, math_ops.log_sigmoid)
@@ -247,8 +245,6 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(x, np.log, math_ops.log)
     self._compareBoth(x, np.log1p, math_ops.log1p)
-    self._compareBoth(x, np.sinh, math_ops.sinh)
-    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(x, np.sign, math_ops.sign)
@@ -289,8 +285,6 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(z, np.log, math_ops.log)
     self._compareBoth(z, np.log1p, math_ops.log1p)
-    self._compareBoth(x, np.sinh, math_ops.sinh)
-    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(y, np.sign, math_ops.sign)
@@ -395,8 +389,6 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
-    self._compareCpu(x, np.sinh, math_ops.sinh)
-    self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
@@ -431,8 +423,6 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
-    self._compareCpu(x, np.sinh, math_ops.sinh)
-    self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
@@ -707,11 +697,6 @@ class BinaryOpTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
-  def testUint8Basic(self):
-    x = np.arange(1, 13, 2).reshape(1, 3, 2).astype(np.uint8)
-    y = np.arange(1, 7, 1).reshape(1, 3, 2).astype(np.uint8)
-    self._compareBoth(x, y, np.add, math_ops.add)
-
   def testInt8Basic(self):
     x = np.arange(1, 13, 2).reshape(1, 3, 2).astype(np.int8)
     y = np.arange(1, 7, 1).reshape(1, 3, 2).astype(np.int8)
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/fft_ops_test.py
index 6c575aea12..6544fe9735 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/fft_ops_test.py
@@ -338,38 +338,37 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 use_placeholder=True)
 
   def testFftLength(self):
-    if test.is_gpu_available(cuda_only=True):
-      with self._fft_kernel_label_map():
-        for rank in VALID_FFT_RANKS:
-          for dims in xrange(rank, rank + 3):
-            for size in (5, 6):
-              inner_dim = size // 2 + 1
-              r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
-                  (size,) * dims)
-              c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
-                           10).reshape((size,) * (dims - 1) + (inner_dim,))
-
-              # Test truncation (FFT size < dimensions).
-              fft_length = (size - 2,) * rank
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
-              # Confirm it works with unknown shapes as well.
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                   use_placeholder=True)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                    use_placeholder=True)
-
-              # Test padding (FFT size > dimensions).
-              fft_length = (size + 2,) * rank
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
-              # Confirm it works with unknown shapes as well.
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                   use_placeholder=True)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                    use_placeholder=True)
+    with self._fft_kernel_label_map():
+      for rank in VALID_FFT_RANKS:
+        for dims in xrange(rank, rank + 3):
+          for size in (5, 6):
+            inner_dim = size // 2 + 1
+            r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
+                (size,) * dims)
+            c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
+                         10).reshape((size,) * (dims - 1) + (inner_dim,))
+
+            # Test truncation (FFT size < dimensions).
+            fft_length = (size - 2,) * rank
+            self._CompareForward(r2c.astype(np.float32), rank, fft_length)
+            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
+
+            # Confirm it works with unknown shapes as well.
+            self._CompareForward(r2c.astype(np.float32), rank, fft_length,
+                                 use_placeholder=True)
+            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
+                                  use_placeholder=True)
+
+            # Test padding (FFT size > dimensions).
+            fft_length = (size + 2,) * rank
+            self._CompareForward(r2c.astype(np.float32), rank, fft_length)
+            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
+
+            # Confirm it works with unknown shapes as well.
+            self._CompareForward(r2c.astype(np.float32), rank, fft_length,
+                                 use_placeholder=True)
+            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
+                                  use_placeholder=True)
 
   def testRandom(self):
     with self._fft_kernel_label_map():
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 4ceb24862f..2d2169c310 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -24,7 +24,6 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
-TIMEOUT = 1
 
 class MapStageTest(test.TestCase):
 
@@ -195,7 +194,8 @@ class MapStageTest(test.TestCase):
     import threading
 
     queue = Queue.Queue()
-    n = 8
+    n = 5
+    missed = 0
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -207,33 +207,30 @@ class MapStageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
-      t.daemon = True
       t.start()
 
-      # Get tokens from the queue until a timeout occurs
-      try:
-        for i in range(n):
-          queue.get(timeout=TIMEOUT)
-      except Queue.Empty:
-        pass
-
-      # Should've timed out on the iteration 'capacity'
-      if not i == capacity:
-        self.fail("Expected to timeout on iteration '{}' "
-                  "but instead timed out on iteration '{}' "
-                  "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
-
-      # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
-
-      # Clear the staging area completely
+      # Get tokens from the queue, making notes of when we timeout
       for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      # This should now succeed
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear out the staging area completely
+      for i in range(capacity):
+        sess.run(get)
 
   def testMemoryLimit(self):
     memory_limit = 512*1024  # 512K
@@ -259,7 +256,8 @@ class MapStageTest(test.TestCase):
     import numpy as np
 
     queue = Queue.Queue()
-    n = 8
+    n = 5
+    missed = 0
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -267,38 +265,35 @@ class MapStageTest(test.TestCase):
       # not fill the queue with n tokens
       def thread_run():
         for i in range(n):
-          data = np.full(chunk, i, dtype=np.uint8)
-          sess.run(stage, feed_dict={x: data, pi: i})
+          sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8),
+                                    pi: i})
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
-      t.daemon = True
       t.start()
 
-      # Get tokens from the queue until a timeout occurs
-      try:
-        for i in range(n):
-          queue.get(timeout=TIMEOUT)
-      except Queue.Empty:
-        pass
-
-      # Should've timed out on the iteration 'capacity'
-      if not i == capacity:
-        self.fail("Expected to timeout on iteration '{}' "
-                  "but instead timed out on iteration '{}' "
-                  "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
-
-      # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
-
-      # Clear the staging area completely
+      # Get tokens from the queue, making notes of when we timeout
       for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      # This should now succeed
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear out the staging area completely
+      for i in range(capacity):
+        sess.run(get)
 
   def testOrdering(self):
     import six
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index 8fec2affa5..a3fc98c20f 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -53,7 +53,6 @@ class RecordInputOpTest(test.TestCase):
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
-    batches = 2
     with self.test_session() as sess:
       self.generateTestData("basic", files, records_per_file)
 
@@ -64,20 +63,17 @@ class RecordInputOpTest(test.TestCase):
           batch_size=1,
           shift_ratio=0.33,
           seed=10,
-          name="record_input",
-          batches=batches)
+          name="record_input")
 
       yield_op = records.get_yield_op()
 
       # cycle over 3 epochs and make sure we never duplicate
       for _ in range(3):
         epoch_set = set()
-        for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
-          self.assertTrue(len(op_list) is batches)
-          for r in op_list:
-            self.assertTrue(r[0] not in epoch_set)
-            epoch_set.add(r[0])
+        for _ in range(files * records_per_file):
+          r = sess.run(yield_op)
+          self.assertTrue(r[0] not in epoch_set)
+          epoch_set.add(r[0])
 
   def testDoesNotDeadlock(self):
     # Iterate multiple times to cause deadlock if there is a chance it can occur
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 14eb2cba68..e67a2c25e9 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import unittest
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -545,24 +544,21 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out, np.zeros(2).astype(np.bool))
 
 
-class SparseReduceTest(test_util.TensorFlowTestCase):
+class SparseReduceSumTest(test_util.TensorFlowTestCase):
 
-  # [[1, ?, 2]
-  #  [?, 3, ?]]
+  # [[1, ?, 1]
+  #  [?, 1, ?]]
   # where ? is implictly-zero.
   ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64)
   vals = np.array([1, 1, 1]).astype(np.int32)
   dense_shape = np.array([2, 3]).astype(np.int64)
 
-  def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
+  def _compare(self, sp_t, reduction_axes, ndims, keep_dims):
     densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
 
     np_ans = densified
     if reduction_axes is None:
-      if do_sum:
-        np_ans = np.sum(np_ans, keepdims=keep_dims)
-      else:
-        np_ans = np.max(np_ans, keepdims=keep_dims)
+      np_ans = np.sum(np_ans, keepdims=keep_dims)
     else:
       if not isinstance(reduction_axes, list):  # Single scalar.
         reduction_axes = [reduction_axes]
@@ -572,28 +568,15 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       # Loop below depends on sorted.
       reduction_axes.sort()
       for ra in reduction_axes.ravel()[::-1]:
-        if do_sum:
-          np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
-        else:
-          np_ans = np.max(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
 
     with self.test_session():
-      if do_sum:
-        tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
-                                                    keep_dims)
-      else:
-        tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
-                                                    keep_dims)
+      tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
+                                                  keep_dims)
       out_dense = tf_dense_ans.eval()
 
-      if do_sum:
-        tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
-                                                            reduction_axes,
-                                                            keep_dims)
-      else:
-        tf_sparse_ans = sparse_ops.sparse_reduce_max_sparse(sp_t,
-                                                            reduction_axes,
-                                                            keep_dims)
+      tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t, reduction_axes,
+                                                          keep_dims)
       # Convert to dense for comparison purposes.
       out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
 
@@ -601,12 +584,9 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self.assertAllClose(np_ans, out_sparse)
 
   def _compare_all(self, sp_t, reduction_axes, ndims):
-    self._compare(sp_t, reduction_axes, ndims, False, False)
-    self._compare(sp_t, reduction_axes, ndims, False, True)
-    self._compare(sp_t, reduction_axes, ndims, True, False)
-    self._compare(sp_t, reduction_axes, ndims, True, True)
+    self._compare(sp_t, reduction_axes, ndims, False)
+    self._compare(sp_t, reduction_axes, ndims, True)
 
-  @unittest.skipIf(np.__version__ == "1.13.0", "numpy 1.13 bug")
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -641,12 +621,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
         sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
         sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
-      with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
-      with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
 
-  @unittest.skipIf(np.__version__ == "1.13.0", "numpy 1.13 bug")
   def testGradient(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 1a6a869e3d..4a89fb64e3 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -23,11 +23,9 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
-TIMEOUT = 1
 
 class StageTest(test.TestCase):
 
-
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -174,7 +172,8 @@ class StageTest(test.TestCase):
     import threading
 
     queue = Queue.Queue()
-    n = 8
+    n = 5
+    missed = 0
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -186,33 +185,31 @@ class StageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
-      t.daemon = True
       t.start()
 
-      # Get tokens from the queue until a timeout occurs
-      try:
-        for i in range(n):
-          queue.get(timeout=TIMEOUT)
-      except Queue.Empty:
-        pass
-
-      # Should've timed out on the iteration 'capacity'
-      if not i == capacity:
-        self.fail("Expected to timeout on iteration '{}' "
-                  "but instead timed out on iteration '{}' "
-                  "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
-
-      # Should have capacity elements in the staging area
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        self.assertTrue(sess.run(ret) == i)
+
+      # Thread should be able to join now
+      t.join()
+
       self.assertTrue(sess.run(size) == capacity)
 
       # Clear the staging area completely
-      for i in range(n):
-        self.assertTrue(sess.run(ret) == i)
+      for i in range(capacity):
+        self.assertTrue(sess.run(ret) == i+(n-capacity))
 
-      # It should now be empty
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
@@ -237,7 +234,8 @@ class StageTest(test.TestCase):
     import numpy as np
 
     queue = Queue.Queue()
-    n = 8
+    n = 5
+    missed = 0
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -249,31 +247,30 @@ class StageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
-      t.daemon = True
       t.start()
 
-      # Get tokens from the queue until a timeout occurs
-      try:
-        for i in range(n):
-          queue.get(timeout=TIMEOUT)
-      except Queue.Empty:
-        pass
-
-      # Should've timed out on the iteration 'capacity'
-      if not i == capacity:
-        self.fail("Expected to timeout on iteration '{}' "
-                  "but instead timed out on iteration '{}' "
-                  "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
-
-      # Should have capacity elements in the staging area
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        self.assertTrue(sess.run(ret)[0] == i)
+
+      # Thread should be able to join now
+      t.join()
+
       self.assertTrue(sess.run(size) == capacity)
 
       # Clear the staging area completely
-      for i in range(n):
-        self.assertTrue(np.all(sess.run(ret) == i))
+      for i in range(capacity):
+        self.assertTrue(sess.run(ret)[0] == i+(n-capacity))
 
       self.assertTrue(sess.run(size) == 0)
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index a8f596c7a3..b2aff617df 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -402,15 +402,14 @@ def _SliceHelper(tensor, slice_spec, var=None):
 
   # Insert another dimension
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[:, tf.newaxis, :].eval()) # => [[[1,2,3]], [[4,5,6]], [[7,8,9]]]
-  print(foo[:, :, tf.newaxis].eval()) # => [[[1],[2],[3]], [[4],[5],[6]], [[7],[8],[9]]]
+  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[:, tf.newaxis, :].eval()) # => [[[1,2,3]], [[4,5,6]], [[7,8,9]]]
+  print(foo[:, :, tf.newaxis].eval()) # => [[[1],[2],[3]], [[4],[5],[6]], [[7],[8],[9]]]
 
   # Ellipses (3 equivalent operations)
-  foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis, ...].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, ...].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
   ```
 
   Notes:
@@ -761,14 +760,11 @@ def parallel_stack(values, name="parallel_stack"):
   parallel_stack([x, y, z])  # => [[1, 4], [2, 5], [3, 6]]
   ```
 
-  The difference between `stack` and `parallel_stack` is that `stack` requires
-  all the inputs be computed before the operation will begin but doesn't require
-  that the input shapes be known during graph construction.
-  
-  `parallel_stack` will copy pieces of the input into the output as they become
-  available, in some situations this can provide a performance benefit.
-  
-  Unlike `stack`, `parallel_stack` does NOT support backpropagation.
+  The difference between stack and parallel_stack is that stack requires all
+  of the inputs be computed before the operation will begin but doesn't require
+  that the input shapes be known during graph construction.  Parallel stack
+  will copy pieces of the input into the output as they become available; in
+  some situations this can provide a performance benefit.
 
   This is the opposite of unstack.  The numpy equivalent is
 
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 829aa99284..4eead79531 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1582,7 +1582,7 @@ class StagingArea(BaseStagingArea):
   This is mostly useful for limiting the number of tensors on
   devices such as GPUs.
 
-  All get() and peek() commands block if the requested data
+  All get() and peek() commands block if the requested data
   is not present in the Staging Area.
 
   """
@@ -2155,8 +2155,7 @@ class RecordInput(object):
                parallelism=1,
                shift_ratio=0,
                seed=0,
-               name=None,
-               batches=None):
+               name=None):
     """Constructs a RecordInput Op.
 
     Args:
@@ -2170,18 +2169,12 @@ class RecordInput(object):
       seed: Specify the random number seed used by generator that randomizes
         records.
       name: Optional name for the operation.
-      batches: None by default, creating a single batch op. Otherwise specifies
-        how many batches to create, which are returned as a list when
-        `get_yield_op()` is called. An example use case is to split processing
-        between devices on one computer.
 
     Raises:
       ValueError: If one of the arguments is invalid.
     """
+
     self._batch_size = batch_size
-    if batches is not None:
-      self._batch_size *= batches
-    self._batches = batches
     self._file_pattern = file_pattern
     self._buffer_size = buffer_size
     self._parallelism = parallelism
@@ -2190,11 +2183,8 @@ class RecordInput(object):
     self._name = name
 
   def get_yield_op(self):
-    """Adds a node that yields a group of records every time it is executed.
-    If RecordInput `batches` parameter is not None, it yields a list of
-    record batches with the specified `batch_size`.
-    """
-    records = gen_data_flow_ops.record_input(
+    """Add a node that yields a minibatch every time it is executed."""
+    return gen_data_flow_ops.record_input(
         file_pattern=self._file_pattern,
         file_buffer_size=self._buffer_size,
         file_parallelism=self._parallelism,
@@ -2202,14 +2192,3 @@ class RecordInput(object):
         batch_size=self._batch_size,
         file_random_seed=self._seed,
         name=self._name)
-    if self._batches is None:
-      return records
-    else:
-      with ops.name_scope(self._name):
-        batch_list = [[] for i in six.moves.range(self._batches)]
-        records = array_ops.split(records, self._batch_size, 0)
-        records = [array_ops.reshape(record, []) for record in records]
-        for index, protobuf in zip(six.moves.range(len(records)), records):
-          batch_index = index % self._batches
-          batch_list[batch_index].append(protobuf)
-        return batch_list
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index 3a804c941a..f96eafed71 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -324,7 +324,7 @@ def log_ndtr(x, series_order=3, name="log_ndtr"):
 
 
 def _log_ndtr_lower(x, series_order):
-  """Asymptotic expansion version of `Log[cdf(x)]`, appropriate for `x<<-1`."""
+  """Asymptotic expansion version of `Log[cdf(x)]`, appropriate for `x<<-1`."""
   x_2 = math_ops.square(x)
   # Log of the term multiplying (1 + sum)
   log_scale = -0.5 * x_2 - math_ops.log(-x) - 0.5 * math.log(2. * math.pi)
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index dade053589..aac7aef8ce 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -989,9 +989,9 @@ def index_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   table = tf.contrib.lookup.index_table_from_tensor(
-      mapping=vocabulary_list, num_oov_buckets=1, default_value=-1)
+      vocabulary_list=vocabulary_list, num_oov_buckets=1, default_value=-1)
   features = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = table.lookup(features)
   ...
@@ -1161,7 +1161,7 @@ def index_to_string_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   table = tf.contrib.lookup.index_to_string_table_from_tensor(
       vocabulary_list, default_value="UNKNOWN")
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index a0f505e47b..024158e709 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -369,24 +369,6 @@ def _Log1pGrad(op, grad):
     return grad * math_ops.reciprocal(1 + x)
 
 
-@ops.RegisterGradient("Sinh")
-def _SinhGrad(op, grad):
-  """Returns grad * cosh(x)."""
-  x = op.inputs[0]
-  with ops.control_dependencies([grad.op]):
-    x = math_ops.conj(x)
-    return grad * math_ops.cosh(x)
-
-
-@ops.RegisterGradient("Cosh")
-def _CoshGrad(op, grad):
-  """Returns grad * sinh(x)."""
-  x = op.inputs[0]
-  with ops.control_dependencies([grad.op]):
-    x = math_ops.conj(x)
-    return grad * math_ops.sinh(x)
-
-
 @ops.RegisterGradient("Tanh")
 def _TanhGrad(op, grad):
   """Returns grad * (1 - tanh(x) * tanh(x))."""
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index bf46bbdcc2..23af8c0a57 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -45,8 +45,6 @@ See the @{$python/math_ops} guide.
 @@expm1
 @@log
 @@log1p
-@@sinh
-@@cosh
 @@ceil
 @@floor
 @@maximum
@@ -2065,11 +2063,12 @@ def tanh(x, name=None):
 
   Args:
     x: A Tensor or SparseTensor with type `float`, `double`, `int32`,
-      `complex64`, or `int64`.
+      `complex64`, `int64`, or `qint32`.
     name: A name for the operation (optional).
 
   Returns:
-    A Tensor or SparseTensor respectively with the same type as `x`.
+    A Tensor or SparseTensor respectively with the same type as `x` if
+    `x.dtype != qint32` otherwise the return type is `quint8`.
   """
   with ops.name_scope(name, "Tanh", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index a1e603199f..c0d9c971a0 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -416,7 +416,7 @@ class LSTMCell(RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://www.bioinf.jku.at/publications/older/2604.pdf
+    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
@@ -622,7 +622,7 @@ class DropoutWrapper(RNNCell):
     """Create a cell with added input, state, and/or output dropout.
 
     If `variational_recurrent` is set to `True` (**NOT** the default behavior),
-    then the same dropout mask is applied at every step, as described in:
+    then the same dropout mask is applied at every step, as described in:
 
     Y. Gal, Z Ghahramani.  "A Theoretically Grounded Application of Dropout in
     Recurrent Neural Networks".  https://arxiv.org/abs/1512.05287
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 93a9656950..377adcdab8 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -30,8 +30,6 @@
 @@sparse_reset_shape
 @@sparse_fill_empty_rows
 @@sparse_transpose
-@@sparse_reduce_max
-@@sparse_reduce_max_sparse
 @@sparse_reduce_sum
 @@sparse_reduce_sum_sparse
 @@sparse_add
@@ -712,90 +710,6 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
-def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
-                      reduction_axes=None):
-  """Computes the max of elements across dimensions of a SparseTensor.
-
-  This Op takes a SparseTensor and is the sparse counterpart to
-  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-  instead of a sparse one.
-
-  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-  with length 1.
-
-  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-  with a single element is returned.  Additionally, the axes can be negative,
-  similar to the indexing rules in Python.
-
-  For example:
-
-  ```python
-  # 'x' represents [[1, ?, 2]
-  #                 [?, 3, ?]]
-  # where ? is implicitly-zero.
-  tf.sparse_reduce_max(x) ==> 3
-  tf.sparse_reduce_max(x, 0) ==> [1, 3, 2]
-  tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
-  tf.sparse_reduce_max(x, 1, keep_dims=True) ==> [[2], [3]]
-  tf.sparse_reduce_max(x, [0, 1]) ==> 3
-  ```
-
-  Args:
-    sp_input: The SparseTensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce; list or scalar. If `None` (the
-      default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
-
-  Returns:
-    The reduced Tensor.
-  """
-  return gen_sparse_ops.sparse_reduce_max(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
-
-
-def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
-                             reduction_axes=None):
-  """Computes the max of elements across dimensions of a SparseTensor.
-
-  This Op takes a SparseTensor and is the sparse counterpart to
-  `tf.reduce_max()`.  In contrast to SparseReduceSum, this Op returns a
-  SparseTensor.
-
-  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-  with length 1.
-
-  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-  with a single element is returned.  Additionally, the axes can be negative,
-  which are interpreted according to the indexing rules in Python.
-
-  Args:
-    sp_input: The SparseTensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce; list or scalar. If `None` (the
-      default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis
-
-  Returns:
-    The reduced SparseTensor.
-  """
-  output_ind, output_val, output_shape = (
-      gen_sparse_ops.sparse_reduce_max_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
-
-  return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
-
-
 def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
                       reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 20ae082ee1..7a6abc8e61 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -87,7 +87,7 @@ class TensorArray(object):
         the shape constraints of each of the elements of the TensorArray.
         Need not be fully defined.
       colocate_with_first_write_call: If `True`, the TensorArray will be
-        colocated on the same device as the Tensor used on its first write
+        colocated on the same device as the Tensor used on its first write
         (write operations include `write`, `unstack`, and `split`).  If `False`,
         the TensorArray will be placed on the device determined by the
         device context available during its initialization.
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 24616aeac3..aceffd373a 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1501,11 +1501,6 @@ def variable_scope(name_or_scope,
   A note about name scoping: Setting `reuse` does not impact the naming of other
   ops such as mult. See related discussion on [github#6189](https://github.com/tensorflow/tensorflow/issues/6189)
 
-  Note that up to and including version 1.0, it was allowed (though
-  explicitly discouraged) to pass False to the reuse argument, yielding
-  undocumented behaviour slightly different from None. Starting at 1.1.0
-  passing None and False as reuse has exactly the same effect.
-
   Args:
     name_or_scope: `string` or `VariableScope`: the scope to open.
     default_name: The default name to use if the `name_or_scope` argument is
diff --git a/tensorflow/python/tools/print_selective_registration_header.py b/tensorflow/python/tools/print_selective_registration_header.py
index 62f00f4467..3e2ab4695e 100644
--- a/tensorflow/python/tools/print_selective_registration_header.py
+++ b/tensorflow/python/tools/print_selective_registration_header.py
@@ -16,7 +16,7 @@ r"""Prints a header file to be used with SELECTIVE_REGISTRATION.
 
 An example of command-line usage is:
   bazel build tensorflow/python/tools:print_selective_registration_header && \
-  bazel-bin/tensorflow/python/tools/print_selective_registration_header \
+  bazel-bin/tensorflow/python/tools/print_selective_registration_header \
     --graphs=path/to/graph.pb > ops_to_register.h
 
 Then when compiling tensorflow, include ops_to_register.h in the include search
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 21183823c2..1755167938 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -762,9 +762,6 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
   tensor_list = _as_tensor_list(tensors)
   with ops.name_scope(name, "shuffle_batch",
                       list(tensor_list) + [keep_input]) as name:
-    if capacity <= min_after_dequeue:
-      raise ValueError("capacity %d must be bigger than min_after_dequeue %d."
-                       % (capacity, min_after_dequeue))
     tensor_list = _validate(tensor_list)
     keep_input = _validate_keep_input(keep_input, enqueue_many)
     tensor_list, sparse_info = _store_sparse_tensors(
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 5b1d9a517e..690af5485a 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -147,7 +147,7 @@ def tf_copts():
       "-Iexternal/gemmlowp",
       "-Wno-sign-compare",
       "-fno-exceptions",
-  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
+  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_android_arm(
       ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({
           clean_dep("//tensorflow:android"): [
               "-std=c++11",
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index eca017edf5..b761f41361 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -768,10 +768,6 @@ tf_module {
     name: "cos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "cosh"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "count_nonzero"
     argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"<dtype: \'int64\'>\", \'None\', \'None\'], "
@@ -1644,10 +1640,6 @@ tf_module {
     name: "sin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "sinh"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "size"
     argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
@@ -1704,14 +1696,6 @@ tf_module {
     name: "sparse_placeholder"
     argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
-  }
   member_method {
     name: "sparse_reduce_sum"
     argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 85c712d3c6..e0a1391d6e 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -205,17 +205,14 @@ if [[ -n "${PY_TAGS}" ]]; then
 $(echo ${WHL_BASE_NAME} | cut -d \- -f 2)-${PY_TAGS}-${PLATFORM_TAG}.whl
 
   if [[ ! -f "${WHL_DIR}/${NEW_WHL_BASE_NAME}" ]]; then
-    if cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}"
-    then
-      echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}"
-    else
+    cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}" && \
+      echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}" || \
       die "ERROR: Failed to copy wheel file to ${NEW_WHL_BASE_NAME}"
-    fi
   fi
 fi
 
 if [[ $(uname) == "Linux" ]]; then
-  AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME//linux/manylinux1})"
+  AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME} | sed "s/linux/manylinux1/")"
 
   # Repair the wheels for cpu manylinux1
   if [[ ${CONTAINER_TYPE} == "cpu" ]]; then
@@ -243,20 +240,14 @@ echo "Installing pip whl file: ${WHL_PATH}"
 VENV_DIR="${PIP_TEST_ROOT}/venv"
 
 if [[ -d "${VENV_DIR}" ]]; then
-  if rm -rf "${VENV_DIR}"
-  then
-    echo "Removed existing virtualenv directory: ${VENV_DIR}"
-  else
-    die "Failed to remove existing virtualenv directory: ${VENV_DIR}"
-  fi
+  rm -rf "${VENV_DIR}" && \
+      echo "Removed existing virtualenv directory: ${VENV_DIR}" || \
+      die "Failed to remove existing virtualenv directory: ${VENV_DIR}"
 fi
 
-if mkdir -p ${VENV_DIR}
-then
-  echo "Created virtualenv directory: ${VENV_DIR}"
-else
-  die "FAILED to create virtualenv directory: ${VENV_DIR}"
-fi
+mkdir -p ${VENV_DIR} && \
+    echo "Created virtualenv directory: ${VENV_DIR}" || \
+    die "FAILED to create virtualenv directory: ${VENV_DIR}"
 
 # Verify that virtualenv exists
 if [[ -z $(which virtualenv) ]]; then
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 9c1b75d004..3b640dd5e8 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -80,7 +80,7 @@ fi
 
 # cmake (CPU) builds do not require configuration.
 if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then
-  CI_COMMAND_PREFIX=("")
+  CI_COMMAND_PREFIX=""
 fi
 
 # Use nvidia-docker if the container is GPU.
@@ -120,9 +120,9 @@ DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]')
 
 # Print arguments.
 echo "WORKSPACE: ${WORKSPACE}"
-echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[*]}"
-echo "COMMAND: ${COMMAND[*]}"
-echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[*]}"
+echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[@]}"
+echo "COMMAND: ${COMMAND[@]}"
+echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[@]}"
 echo "CONTAINER_TYPE: ${CONTAINER_TYPE}"
 echo "BUILD_TAG: ${BUILD_TAG}"
 echo "  (docker container name will be ${DOCKER_IMG_NAME})"
@@ -140,7 +140,7 @@ if [[ $? != "0" ]]; then
 fi
 
 # Run the command inside the container.
-echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..."
+echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..."
 mkdir -p ${WORKSPACE}/bazel-ci_build-cache
 # By default we cleanup - remove the container once it finish running (--rm)
 # and share the PID namespace (--pid=host) so the process inside does not have
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 1cf87d7c7c..dfaf50eb4f 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -200,8 +200,8 @@ echo "  TF_BUILD_ENABLE_XLA=${TF_BUILD_ENABLE_XLA}"
 function get_cuda_capability_version() {
   if [[ ! -z $(which deviceQuery) ]]; then
     # The first listed device is used
-    deviceQuery | grep "CUDA Capability .* version" | \
-        head -1 | awk '{print $NF}'
+    echo $(deviceQuery | grep "CUDA Capability .* version" | \
+        head -1 | awk '{print $NF}')
   fi
 }
 
@@ -532,14 +532,11 @@ if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then
   DOCKERFILE="${TMP_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}"
 
   # Replace a line in the Dockerfile
-  if sed -i \
+  sed -i \
       's/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_python3.5_pip_packages.sh/g' \
-      "${DOCKERFILE}"
-  then
-    echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}"
-  else
-    die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}"
-  fi
+      "${DOCKERFILE}" && \
+      echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" || \
+      die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}"
 
   DOCKERFILE_FLAG="--dockerfile ${DOCKERFILE}"
 fi
@@ -577,7 +574,7 @@ rm -f ${TMP_SCRIPT}
 END_TIME=$(date +'%s')
 echo ""
 echo "Parameterized build ends with ${RESULT} at: $(date) "\
-"(Elapsed time: $((END_TIME - START_TIME)) s)"
+"(Elapsed time: $((${END_TIME} - ${START_TIME})) s)"
 
 
 # Clean up temporary directory if it exists
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index e428766a40..fd2874df91 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -47,7 +47,7 @@ num_cpus() {
 # Get the hash of the last non-merge git commit on the current branch.
 # Usage: get_last_non_merge_git_commit
 get_last_non_merge_git_commit() {
-  git rev-list --no-merges -n 1 HEAD
+  echo $(git rev-list --no-merges -n 1 HEAD)
 }
 
 # List files changed (i.e., added, removed or revised) in the last non-merge
@@ -75,7 +75,7 @@ get_py_files_to_check() {
 
     echo "${PY_FILES}"
   else
-    find tensorflow -name '*.py'
+    echo $(find tensorflow -name '*.py')
   fi
 }
 
@@ -157,25 +157,25 @@ do_pylint() {
   NONWL_ERRORS_FILE="$(mktemp)_pylint_nonwl_errors.log"
 
   rm -rf ${OUTPUT_FILE}
-  rm -rf ${ERRORS_FILE}
+  rm -rf ${ERRORS_FLIE}
   rm -rf ${NONWL_ERRORS_FILE}
   touch ${NONWL_ERRORS_FILE}
 
   ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \
-      --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} > ${OUTPUT_FILE} 2>&1
+      --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} 2>&1 > ${OUTPUT_FILE}
   PYLINT_END_TIME=$(date +'%s')
 
   echo ""
-  echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
+  echo "pylint took $((${PYLINT_END_TIME} - ${PYLINT_START_TIME})) s"
   echo ""
 
   grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
-  while read -r LINE; do
+  while read LINE; do
     IS_WHITELISTED=0
     for WL_REGEX in ${ERROR_WHITELIST}; do
-      if echo ${LINE} | grep -q "${WL_REGEX}"; then
+      if [[ ! -z $(echo ${LINE} | grep "${WL_REGEX}") ]]; then
         echo "Found a whitelisted error:"
         echo "  ${LINE}"
         IS_WHITELISTED=1
@@ -248,7 +248,7 @@ do_pep8() {
   PEP8_END_TIME=$(date +'%s')
 
   echo ""
-  echo "pep8 took $((PEP8_END_TIME - PEP8_START_TIME)) s"
+  echo "pep8 took $((${PEP8_END_TIME} - ${PEP8_START_TIME})) s"
   echo ""
 
   if [[ -s ${PEP8_OUTPUT_FILE} ]]; then
@@ -278,7 +278,7 @@ do_buildifier(){
   BUILDIFIER_END_TIME=$(date +'%s')
 
   echo ""
-  echo "buildifier took $((BUILDIFIER_END_TIME - BUILDIFIER_START_TIME)) s"
+  echo "buildifier took $((${BUILDIFIER_END_TIME} - ${BUILDIFIER_START_TIME})) s"
   echo ""
 
   if [[ -s ${BUILDIFIER_OUTPUT_FILE} ]]; then
@@ -306,7 +306,7 @@ do_external_licenses_check(){
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
  bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
-  | grep -E -v "^//tensorflow" \
+  | egrep -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
   | uniq 2>&1 \
@@ -315,7 +315,7 @@ do_external_licenses_check(){
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
   bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
-  | grep -E -v "^//tensorflow" \
+  | egrep -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
   | uniq 2>&1 \
@@ -329,7 +329,7 @@ do_external_licenses_check(){
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
 
   echo
-  echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
+  echo "do_external_licenses_check took $((${EXTERNAL_LICENSES_CHECK_END_TIME} - ${EXTERNAL_LICENSES_CHECK_START_TIME})) s"
   echo
 
   if [[ -s ${MISSING_LICENSES_FILE} ]] || [[ -s ${EXTRA_LICENSES_FILE} ]] ; then
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 7fcd235e62..b8f9fc8453 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -44,8 +44,8 @@ pip2 install --upgrade markdown==2.6.8
 pip3 install --upgrade markdown==2.6.8
 
 # Install protobuf.
-pip2 install --upgrade protobuf==3.3.0
-pip3 install --upgrade protobuf==3.3.0
+pip2 install --upgrade protobuf==3.2.0
+pip3 install --upgrade protobuf==3.2.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index 7934002b2c..773c89b70b 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -17,9 +17,9 @@
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.3.0"
+PROTOBUF_VERSION="3.2.0"
 protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g')
-local_protobuf_ver=$(protoc --version)
+local_protobuf_ver=$(protoc --version | awk '{print $2}')
 local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g')
 if [[ -z $local_protobuf_ver_flat ]]; then
   local_protobuf_ver_flat=0
@@ -30,7 +30,7 @@ if (( $local_protobuf_ver_flat < $protobuf_ver_flat )); then
   PROTOBUF_ZIP=$(basename "${PROTOBUF_URL}")
   UNZIP_DEST="google-protobuf"
 
-  wget "${PROTOBUF_URL}"
+  wget -q "${PROTOBUF_URL}"
   unzip "${PROTOBUF_ZIP}" -d "${UNZIP_DEST}"
   cp "${UNZIP_DEST}/bin/protoc" /usr/local/bin/
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 084ac49496..e7e2d256cd 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -64,7 +64,7 @@ set -e
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.5 install --upgrade protobuf==3.3.0
+pip3.5 install --upgrade protobuf==3.2.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
diff --git a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
index 3e31aa1ce1..59ba71f5df 100755
--- a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
+++ b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
@@ -14,7 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 
-PROTOBUF_VERSION="3.3.1"
+PROTOBUF_VERSION="3.2.0"
 PYTHON_BIN=${PYTHON_BIN:-python}
 DIR=${PWD}/protobuf
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index dff4707cbe..5581023ad7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -161,5 +161,5 @@ function create_python_test_dir() {
 
 function reinstall_tensorflow_pip() {
   echo "y" | pip uninstall tensorflow -q || true
-  pip install ${1} --no-deps
+  pip install ${1}
 }
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 96fbadd176..8b6836f4ed 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -29,6 +29,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
 
+:: Attempt to upgrade PIP to work around Anaconda issue #542.
+%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
+
 :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
 DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
@@ -36,7 +39,7 @@ del wheel_filename_file
 
 :: Install the pip package.
 echo Installing PIP package...
-%PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
+%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
 if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index e774a6e916..ba2d939b5f 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
 
+:: Attempt to upgrade PIP to work around Anaconda issue #542.
+%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
+
 :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
 DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
@@ -35,7 +38,7 @@ del wheel_filename_file
 
 :: Install the pip package.
 echo Installing PIP package...
-%PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
+%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
 if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index 7d7f92d246..f536beef54 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -70,7 +70,7 @@ get_container_id_by_image_name() {
     # Get the id of a container by image name
     # Usage: get_docker_container_id_by_image_name <img_name>
 
-    docker ps | grep $1 | awk '{print $1}'
+    echo $(docker ps | grep $1 | awk '{print $1}')
 }
 
 # Parse input arguments
@@ -152,7 +152,7 @@ rm -rf "${BUILD_DIR}"
 docker run ${DOCKER_IMG_NAME} \
     /var/tf_dist_test/scripts/dist_mnist_test.sh \
     --ps_hosts $(seq -f "localhost:%g" -s "," \
-                 2000 $((2000 + NUM_PARAMETER_SERVERS - 1))) \
+                 2000 $((2000 + ${NUM_PARAMETER_SERVERS} - 1))) \
     --worker_hosts $(seq -f "localhost:%g" -s "," \
-                     3000 $((3000 + NUM_WORKERS - 1))) \
+                     3000 $((3000 + ${NUM_WORKERS} - 1))) \
     --num_gpus 0 ${SYNC_REPLICAS_FLAG}
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 6d5a9bdc4c..3e45ae362c 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -54,30 +54,6 @@ for additional containers, such as release candidates or nightly builds.
 
 ## Rebuilding the containers
 
-Building TensorFlow Docker containers should be done through the
-[parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/README.md)
-script. The raw Dockerfiles should not be used directly as they contain strings
-to be replaced by the script during the build.
+Just pick the dockerfile corresponding to the container you want to build, and run
 
-To use the script, specify the container type (`CPU` vs. `GPU`), the desired
-Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
-is to be built (`NO` vs. `YES`). In addition, you need to specify the central
-location from where the pip package of TensorFlow will be downloaded.
-
-For example, to build a CPU-only non-developer Docker image for Python 2, using
-TensorFlow's nightly pip package:
-
-``` bash
-export TF_DOCKER_BUILD_IS_DEVEL=NO
-export TF_DOCKER_BUILD_TYPE=CPU
-export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2
-
-export NIGHTLY_VERSION="1.head"
-export TF_DOCKER_BUILD_CENTRAL_PIP=$(echo ${TF_DOCKER_BUILD_PYTHON_VERSION} | sed s^PYTHON2^http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION},label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp27-cp27mu-manylinux1_x86_64.whl^ | sed s^PYTHON3^http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp35-cp35m-manylinux1_x86_64.whl^)
-
-tensorflow/tools/docker/parameterized_docker_build.sh
-```
-
-If successful, the image will be tagged as `${USER}/tensorflow:latest` by default.
-
-Rebuilding GPU images requires [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+    $ docker build --pull -t $USER/tensorflow-suffix -f Dockerfile.suffix .
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index ea88d8165f..f88af68cde 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -233,16 +233,13 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
 
   # Modify python/pip version if necessary.
   if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
+    sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
-    then
-      echo "Modified Dockerfile for python version "\
-"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
-    else
-      die "FAILED to modify ${DOCKERFILE} for python3"
-    fi
+        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
+        echo "Modified Dockerfile for python version "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
+        die "FAILED to modify ${DOCKERFILE} for python3"
   fi
 else
   DOCKERFILE="${TMP_DIR}/Dockerfile"
@@ -253,17 +250,14 @@ else
 
   # Modify python/pip version if necessary.
   if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
+    sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
-    then
-      echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
-    else
-      die "FAILED to modify ${DOCKERFILE} for python3"
-    fi
+        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
+        echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
+        die "FAILED to modify ${DOCKERFILE} for python3"
   fi
 fi
 
@@ -283,7 +277,7 @@ fi
 
 # Make sure that there is no other containers of the same image running
 # TODO(cais): Move to an earlier place.
-if "${DOCKER_BINARY}" ps | grep -q "${IMG}"; then
+if [[ ! -z $("${DOCKER_BINARY}" ps | grep "${IMG}") ]]; then
   die "ERROR: It appears that there are docker containers of the image "\
 "${IMG} running. Please stop them before proceeding"
 fi
@@ -316,22 +310,16 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     # on the running docker container
     echo ""
     echo "Performing basic sanity checks on the running container..."
-    if wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null
-    then
-      echo "  PASS: wget tree"
-    else
-      mark_check_failed "  FAIL: wget tree"
-    fi
+    wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null && \
+        echo "  PASS: wget tree" || \
+        mark_check_failed "  FAIL: wget tree"
 
     for NB in ${TMP_DIR}/notebooks/*.ipynb; do
       NB_BASENAME=$(basename "${NB}")
       NB_URL="http://127.0.0.1:${CONTAINER_PORT}/notebooks/${NB_BASENAME}"
-      if wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null
-      then
-        echo "  PASS: wget ${NB_URL}"
-      else
-        mark_check_failed  "  FAIL: wget ${NB_URL}"
-      fi
+      wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null && \
+          echo "  PASS: wget ${NB_URL}" || \
+          mark_check_failed  "  FAIL: wget ${NB_URL}"
     done
   fi
 
diff --git a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
index 7146213b33..2ce0fb394f 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@@ -71,7 +71,7 @@ rm -rf ${LOG_FILE} || \
 
 # Invoke main Python file
 python "${GCS_SMOKE_PY}" --gcs_bucket_url="${GCS_BUCKET_URL}" \
-    > "${LOG_FILE}" 2>&1
+    2>&1 > "${LOG_FILE}"
 
 if [[ $? != "0" ]]; then
   cat ${LOG_FILE}
@@ -92,9 +92,6 @@ NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
 if [[ -z ${NEW_TFREC_URL} ]]; then
   die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
 fi
-if "${GSUTIL_BIN}" rm "${NEW_TFREC_URL}"
-then
-  echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}"
-else
-  die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
-fi
+"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
+    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
+    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index 977fe16333..1487658049 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -20,7 +20,7 @@ if [[ -z "${OUTPUT_FILENAME}"  ]]; then
   exit 1
 fi
 
-GIT_VERSION=$(git describe --long --tags)
+GIT_VERSION=$(git describe --long --tags)  # $(...) nests and quotes more safely than backticks
 if [[ $? != 0 ]]; then
    GIT_VERSION=unknown;
 fi
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index b4274e67df..bfda55d3ad 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -1055,7 +1055,7 @@ in the future.
 
 The Graph Transform Tool associates names of transforms with the code to
 implement them using the `REGISTER_GRAPH_TRANSFORM()` macro. This takes a string
-and a function, and automagically registers the transform with the tool. You
+and a function, and automatically registers the transform with the tool. You
 will need to watch out for a few things though:
 
 *   Because it's using global C++ objects in each file under the hood, the
diff --git a/tensorflow/tools/graph_transforms/quantize_weights_test.cc b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
index 63c5b5a64d..e1a105bdd3 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights_test.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
@@ -90,13 +90,13 @@ class QuantizeWeightsTest : public ::testing::Test {
     EXPECT_EQ("Const", q_weights_const->op());
     EXPECT_EQ(DT_QUINT8, q_weights_const->attr().at("dtype").type());
 
-    // Run the original graph.
+    // Run the original graph.
     std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
     TF_ASSERT_OK(original_session->Create(original_graph_def));
     std::vector<Tensor> original_outputs;
     TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
 
-    // Run the quantized graph.
+    // Run the quantized graph.
     std::unique_ptr<Session> quantized_session(NewSession(SessionOptions()));
     TF_ASSERT_OK(quantized_session->Create(quantized_graph_def));
     std::vector<Tensor> quantized_outputs;
diff --git a/tensorflow/tools/lib_package/libtensorflow_java_test.sh b/tensorflow/tools/lib_package/libtensorflow_java_test.sh
index c44978fc57..a44298e01a 100755
--- a/tensorflow/tools/lib_package/libtensorflow_java_test.sh
+++ b/tensorflow/tools/lib_package/libtensorflow_java_test.sh
@@ -29,7 +29,7 @@ TAR="${TAR}"
 
 [ -z "${JAVA}" ] && JAVA="java"
 [ -z "${JAVAC}" ] && JAVAC="javac"
-[ -z "${TAR}" ] && TAR="tar"
+[ -z "${TAR}" ] && TAR="tar"  # default to the system tar when TAR is unset/empty
 
 # bazel tests run with ${PWD} set to the root of the bazel workspace
 TARFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz"
diff --git a/tensorflow/tools/lib_package/libtensorflow_test.sh b/tensorflow/tools/lib_package/libtensorflow_test.sh
index 7dfe8eefcc..6430c755af 100755
--- a/tensorflow/tools/lib_package/libtensorflow_test.sh
+++ b/tensorflow/tools/lib_package/libtensorflow_test.sh
@@ -26,7 +26,7 @@ CC="${CC}"
 TAR="${TAR}"
 
 [ -z "${CC}" ] && CC="/usr/bin/gcc"
-[ -z "${TAR}" ] && TAR="tar"
+[ -z "${TAR}" ] && TAR="tar"  # default to the system tar when TAR is unset/empty
 
 # bazel tests run with ${PWD} set to the root of the bazel workspace
 TARFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow.tar.gz"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 54ba8064e8..073988765c 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.2.0'
+_VERSION = '1.2.0-rc2'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/tools/tfprof/BUILD b/tensorflow/tools/tfprof/BUILD
index 57cccd8921..541df78e47 100644
--- a/tensorflow/tools/tfprof/BUILD
+++ b/tensorflow/tools/tfprof/BUILD
@@ -33,6 +33,7 @@ cc_binary(
         "//tensorflow/tools/tfprof/internal:tfprof_options",
         "//tensorflow/tools/tfprof/internal:tfprof_stats",
         "//tensorflow/tools/tfprof/internal:tfprof_utils",
+        "//tensorflow/tools/tfprof/internal/advisor:tfprof_advisor",
         "@linenoise//:linenoise",
     ],
 )
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
index 54f3cd62f2..816ad8c07e 100644
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@@ -1,15 +1,19 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-### Features
+### Features
 
 * Profile model architectures
   * parameters, tensor shapes, float operations, device placement, etc.
 * Profile model performance
   * execution time, memory consumption
   * Profile multiple steps.
-* Auto detect and advise. (Experimental)
+* Auto profile and advise.
+  * accelerator utilization check
+  * expensive operation check
+  * operation configuration check
+  * distributed runtime check (Not OSS)
 
-### Interfaces
+### Interfaces
 
 * Python API
 * Command Line
@@ -47,7 +51,7 @@ See [Options](g3doc/options.md) for detail instructions.
 -output                     stdout:
 ```
 
-### Tutorials
+### Tutorials
 
 *  [Python API](g3doc/python_api.md)
 *  [Command Line Interface](g3doc/command_line.md)
diff --git a/tensorflow/tools/tfprof/g3doc/advise.md b/tensorflow/tools/tfprof/g3doc/advise.md
index 3bce6270ff..e30add6fbf 100644
--- a/tensorflow/tools/tfprof/g3doc/advise.md
+++ b/tensorflow/tools/tfprof/g3doc/advise.md
@@ -3,6 +3,7 @@
 tfprof analyzes profiles and generates advises for common issues.
 
 ### Run Advise.
+
 ```python
 # First create a profiler. See profiler tutorials for more details.
 profiler = model_analyzer.Profiler(sess.graph)
@@ -13,8 +14,63 @@ _ = sess.run(r1,
              run_metadata=run_meta)
 profiler.add_step(1, run_meta)
 
-# Start advise.
-profiler.advise()
+# Then Start advise.
+profiler.advise(model_analyzer.ALL_ADVICE)
+
+# For one-shot API
+tf.contrib.tfprof.model_analyzer.advise(
+    sess.graph, run_meta=run_meta)
+```
+
+```shell
+# Run advisor on CLI
+# See CLI tutorial on generating the files.
+tfprof --graph_path=graph.pbtxt \
+       --run_meta_path=run_metadata \
+       --op_log_path=tfprof_log
+
+tfprof> advise
+AcceleratorUtilizationChecker:
+device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+
+OperationChecker:
+Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
+
+ExpensiveOperationChecker:
+top 1 operation type: SoftmaxCrossEntropyWithLogits, cpu: 1.37sec, accelerator: 0us, total: 1.37sec (26.68%)
+top 2 operation type: MatMul, cpu: 427.39ms, accelerator: 280.76ms, total: 708.14ms (13.83%)
+top 3 operation type: ConcatV2, cpu: 357.83ms, accelerator: 31.80ms, total: 389.63ms (7.61%)
+seq2seq_attention_model.py:360:build_graph:self._add_seq2seq(), cpu: 3.16sec, accelerator: 214.84ms, total: 3.37sec
+  seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ..., cpu: 2.46sec, accelerator: 3.25ms, total: 2.47sec
+    seq2seq_lib.py:181:sampled_sequence_...:average_across_ti..., cpu: 2.46sec, accelerator: 3.24ms, total: 2.47sec
+      seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f..., cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec
+        seq2seq_attention_model.py:289:sampled_loss_func:num_classes=vsize), cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec
+        seq2seq_attention_model.py:282:sampled_loss_func:labels = tf.resha..., cpu: 164us, accelerator: 0us, total: 164us
+      seq2seq_lib.py:148:sequence_loss_by_...:log_perp_list.app..., cpu: 1.33ms, accelerator: 120us, total: 1.45ms
+      seq2seq_lib.py:151:sequence_loss_by_...:total_size = tf.a..., cpu: 154us, accelerator: 23us, total: 177us
+    seq2seq_lib.py:184:sampled_sequence_...:return cost / tf...., cpu: 97us, accelerator: 8us, total: 105us
+      math_ops.py:690:cast:return gen_math_o..., cpu: 62us, accelerator: 3us, total: 65us
+      math_ops.py:839:binary_op_wrapper:return func(x, y,..., cpu: 35us, accelerator: 5us, total: 40us
+  seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a..., cpu: 651.56ms, accelerator: 158.92ms, total: 810.48ms
+    seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ..., cpu: 306.58ms, accelerator: 73.54ms, total: 380.12ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 306.52ms, accelerator: 73.54ms, total: 380.05ms
+        rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 303.76ms, accelerator: 73.54ms, total: 377.30ms
+        rnn.py:216:_rnn_step:time >= max_seque..., cpu: 2.75ms, accelerator: 0us, total: 2.75ms
+      core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 67us, accelerator: 0us, total: 67us
+    seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,..., cpu: 296.21ms, accelerator: 73.54ms, total: 369.75ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 296.11ms, accelerator: 73.54ms, total: 369.65ms
+        rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 292.04ms, accelerator: 73.54ms, total: 365.58ms
+        rnn.py:216:_rnn_step:time >= max_seque..., cpu: 4.07ms, accelerator: 0us, total: 4.07ms
+      core_rnn.py:178:static_rnn:min_sequence_leng..., cpu: 85us, accelerator: 0us, total: 85us
+      core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 16us, accelerator: 0us, total: 16us
+    seq2seq_lib.py:113:bidirectional_rnn:outputs = [tf.con..., cpu: 46.88ms, accelerator: 3.87ms, total: 50.75ms
+ ...(omitted)
+top 1 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_11, cpu: 89.92ms, accelerator: 0us, total: 89.92ms
+top 2 graph node: train_step/update_seq2seq/output_projection/w/ApplyAdam, cpu: 84.52ms, accelerator: 0us, total: 84.52ms
+top 3 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_19, cpu: 73.02ms, accelerator: 0us, total: 73.02ms
 ```
 
 ### Checker
@@ -25,16 +81,24 @@ area with the profile and report issues. A `Checker` is like a plugin.
 
 For example:
 
-####JobChecker (Not Available OSS)
-* Checking RecvTensor RPC latency and bandwidth.
-* Checking CPU/Memory utilization of the job.
+#### JobChecker (Not Available OSS)
+
+*   Checks RecvTensor RPC latency and bandwidth.
+*   Checks CPU/Memory utilization of the job.
 
 ####AcceleratorUtilization Checker
 * Checks what percentage of time the accelerator spends on computation.
 
-####Operation Checker
-* Check whether the operation runs with optimal options.
-* Checks if there is a better implementation to replace the current operation.
+#### OperationChecker
+
+*   Checks whether the operation runs with optimal options.
+*   Checks if there is a better implementation to replace the current operation.
+
+#### ExpensiveOperationChecker
+
+*   Checks the most expensive operation type.
+*   Checks the most expensive graph nodes.
+*   Checks the most expensive graph-building Python code.
 
 ####Contribute Your Checker
 
diff --git a/tensorflow/tools/tfprof/g3doc/command_line.md b/tensorflow/tools/tfprof/g3doc/command_line.md
index 9f0de72e07..0d8d56809a 100644
--- a/tensorflow/tools/tfprof/g3doc/command_line.md
+++ b/tensorflow/tools/tfprof/g3doc/command_line.md
@@ -126,7 +126,7 @@ tfprof>
 -show_name_regexes          .*
 -hide_name_regexes          IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]*
 -account_displayed_op_only  false
-# supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path.
+# supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path.
 # [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types]
 -select                     params
 # format: output_type:key=value,key=value...
diff --git a/tensorflow/tools/tfprof/internal/advisor/BUILD b/tensorflow/tools/tfprof/internal/advisor/BUILD
index e4bd0f0015..629322373a 100644
--- a/tensorflow/tools/tfprof/internal/advisor/BUILD
+++ b/tensorflow/tools/tfprof/internal/advisor/BUILD
@@ -44,12 +44,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "expensive_operation_checker",
+    hdrs = ["expensive_operation_checker.h"],
+    deps = [
+        ":checker",
+    ],
+)
+
 cc_library(
     name = "tfprof_advisor",
     hdrs = ["tfprof_advisor.h"],
     deps = [
         ":accelerator_utilization_checker",
         ":checker",
+        ":expensive_operation_checker",
         ":internal_checker_runner_dummy",
         ":operation_checker",
     ],
diff --git a/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h b/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h
index fb7f65d7dc..074b8e57b0 100644
--- a/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h
+++ b/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h
@@ -33,10 +33,11 @@ struct ExecStats {
 
 class AcceleratorUtilizationChecker : public Checker {
  public:
-  string name() override { return "AcceleratorUtilizationChecker"; }
+  string name() const override { return kCheckers[0]; }
 
  private:
-  std::vector<string> Check(const TFStats* stats) override {
+  AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options,
+                             const TFStats* stats) override {
     if (!stats) {
       fprintf(stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n",
               name().c_str());
@@ -48,24 +49,21 @@ class AcceleratorUtilizationChecker : public Checker {
     return CheckInternal();
   }
 
-  std::vector<string> CheckInternal() {
+  AdviceProto::Checker CheckInternal() {
     for (const auto& s : accelerator_exec_stats_) {
       const ExecStats& stat = s.second;
       int64 total_micros = stat.end_micros - stat.start_micros;
       if (total_micros <= 0) continue;
       double utilization = 1.0 * stat.exec_micros / total_micros;
       if (utilization >= 0.5) {
-        reports_.push_back(strings::Printf("%s: device: %s utilization: %.2f",
-                                           kLevel[0], s.first.c_str(),
-                                           utilization));
+        reports_.add_reports(strings::Printf("device: %s utilization: %.2f",
+                                             s.first.c_str(), utilization));
       } else if (utilization < 0.5 && utilization > 0.2) {
-        reports_.push_back(
-            strings::Printf("%s: device: %s low utilization: %.2f", kLevel[1],
-                            s.first.c_str(), utilization));
+        reports_.add_reports(strings::Printf("device: %s low utilization: %.2f",
+                                             s.first.c_str(), utilization));
       } else if (utilization <= 0.2) {
-        reports_.push_back(
-            strings::Printf("%s: device: %s low utilization: %.2f", kLevel[2],
-                            s.first.c_str(), utilization));
+        reports_.add_reports(strings::Printf("device: %s low utilization: %.2f",
+                                             s.first.c_str(), utilization));
       }
     }
     return reports_;
@@ -102,7 +100,7 @@ class AcceleratorUtilizationChecker : public Checker {
 
   std::map<string, ExecStats> accelerator_exec_stats_;
   std::map<string, int64> ps_placement_;
-  std::vector<string> reports_;
+  AdviceProto::Checker reports_;
 };
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/advisor/checker.h b/tensorflow/tools/tfprof/internal/advisor/checker.h
index b8b057be5b..3ce80cd8c4 100644
--- a/tensorflow/tools/tfprof/internal/advisor/checker.h
+++ b/tensorflow/tools/tfprof/internal/advisor/checker.h
@@ -18,27 +18,33 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+#include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
 
-static const char* const kLevel[] = {
-    "NOTE",     // Good to know.
-    "SUGGEST",  // Might get better.
-    "WARN",     // Please do it for better.
+// Append only.
+static const char* const kCheckers[] = {
+    "AcceleratorUtilizationChecker", "OperationChecker",
+    "ExpensiveOperationChecker",
+    "JobChecker",  // Internal checker.
 };
 
 class Checker {
  public:
-  virtual ~Checker(){};
+  virtual ~Checker() {}
 
-  virtual string name() = 0;
+  virtual string name() const = 0;
 
-  std::vector<string> Run(const TFStats* stats) { return Check(stats); }
+  AdviceProto::Checker Run(const AdvisorOptionsProto::CheckerOption& options,
+                           const TFStats* stats) {
+    return Check(options, stats);
+  }
 
  protected:
-  // Returns a vector of string, each one being an advice.
-  virtual std::vector<string> Check(const TFStats* stats) = 0;
+  virtual AdviceProto::Checker Check(
+      const AdvisorOptionsProto::CheckerOption& options,
+      const TFStats* stats) = 0;
 };
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h b/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h
index 1238b57f20..ed8ae571b6 100644
--- a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h
+++ b/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h
@@ -17,13 +17,16 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
 
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_options.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
 
 class TFStats;
 
-std::map<string, std::vector<string>> RunInternalCheckers(const TFStats* stats);
+AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
+                                const TFStats* stats);
 
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc b/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc
index 8204d2b04e..67962c8e8b 100644
--- a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc
+++ b/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc
@@ -17,9 +17,9 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-std::map<string, std::vector<string>> RunInternalCheckers(
-    const TFStats* stats) {
-  return std::map<string, std::vector<string>>();
+AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
+                                const TFStats* stats) {
+  return AdviceProto();
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/advisor/operation_checker.h b/tensorflow/tools/tfprof/internal/advisor/operation_checker.h
index 2a05f9bfd0..4d0d68e3bf 100644
--- a/tensorflow/tools/tfprof/internal/advisor/operation_checker.h
+++ b/tensorflow/tools/tfprof/internal/advisor/operation_checker.h
@@ -24,10 +24,11 @@ namespace tfprof {
 
 class OperationChecker : public Checker {
  public:
-  string name() override { return "OperationChecker"; }
+  string name() const override { return kCheckers[1]; }
 
  private:
-  std::vector<string> Check(const TFStats* stats) override {
+  AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options,
+                             const TFStats* stats) override {
     if (!stats) {
       fprintf(stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n",
               name().c_str());
@@ -53,22 +54,20 @@ class OperationChecker : public Checker {
       }
     }
     if (use_batch_norm && !use_fused_batch_norm) {
-      reports_.push_back(strings::Printf(
-          "%s: Maybe use faster FusedBatchNorm instead of BatchNorm",
-          kLevel[1]));
+      reports_.add_reports(
+          "Maybe use faster FusedBatchNorm instead of BatchNorm");
     }
     if (recommend_nchw) {
       // TODO(xpan): Maybe print which Op supports NCHW.
-      reports_.push_back(strings::Printf(
-          "%s: Found operation using NHWC data_format on GPU. Maybe "
-          "NCHW is faster.",
-          kLevel[1]));
+      reports_.add_reports(
+          "Found operation using NHWC data_format on GPU. Maybe "
+          "NCHW is faster.");
     }
     return reports_;
   }
 
  private:
-  std::vector<string> reports_;
+  AdviceProto::Checker reports_;
 };
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h b/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h
index 856f515459..d2257fb9b5 100644
--- a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h
+++ b/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h
@@ -18,8 +18,10 @@ limitations under the License.
 
 #include "tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h"
 #include "tensorflow/tools/tfprof/internal/advisor/checker.h"
+#include "tensorflow/tools/tfprof/internal/advisor/expensive_operation_checker.h"
 #include "tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h"
 #include "tensorflow/tools/tfprof/internal/advisor/operation_checker.h"
+#include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -29,23 +31,44 @@ class Advisor {
  public:
   Advisor(const TFStats* stats) : stats_(stats) {}
 
-  std::map<string, std::vector<string>> Advise() {
+  static AdvisorOptionsProto DefaultOptions() {
+    AdvisorOptionsProto options;
+    std::vector<string> checkers(
+        kCheckers, kCheckers + sizeof(kCheckers) / sizeof(*kCheckers));
+    for (const string& checker : checkers) {
+      (*options.mutable_checkers())[checker];
+    }
+    return options;
+  }
+
+  AdviceProto Advise(const AdvisorOptionsProto& options) {
     // Note: Release a checker's memory ASAP.
-    std::map<string, std::vector<string>> reports = RunInternalCheckers(stats_);
-    // TODO(xpan): Think of a way to turn off/on specific checkers.
-    AcceleratorUtilizationChecker au_checker;
-    reports[au_checker.name()] = au_checker.Run(stats_);
-    OperationChecker op_checker;
-    reports[op_checker.name()] = op_checker.Run(stats_);
-
-    for (const auto& checker_r : reports) {
-      fprintf(stdout, "%s reports:\n", checker_r.first.c_str());
-      for (const auto& r : checker_r.second) {
+    AdviceProto ret = RunInternalCheckers(options, stats_);
+
+    if (options.checkers().find(kCheckers[0]) != options.checkers().end()) {
+      AcceleratorUtilizationChecker au_checker;
+      (*ret.mutable_checkers())[kCheckers[0]].MergeFrom(
+          au_checker.Run(options.checkers().at(kCheckers[0]), stats_));
+    }
+    if (options.checkers().find(kCheckers[1]) != options.checkers().end()) {
+      OperationChecker op_checker;
+      (*ret.mutable_checkers())[kCheckers[1]].MergeFrom(
+          op_checker.Run(options.checkers().at(kCheckers[1]), stats_));
+    }
+    if (options.checkers().find(kCheckers[2]) != options.checkers().end()) {
+      ExpensiveOperationChecker expensive_op_checker;
+      (*ret.mutable_checkers())[kCheckers[2]].MergeFrom(
+          expensive_op_checker.Run(options.checkers().at(kCheckers[2]),
+                                   stats_));
+    }
+    for (const auto& checker : ret.checkers()) {
+      fprintf(stdout, "\n%s:\n", checker.first.c_str());
+      for (const string& r : checker.second.reports()) {
         fprintf(stdout, "%s\n", r.c_str());
       }
     }
     fflush(stdout);
-    return reports;
+    return ret;
   }
 
  private:
diff --git a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc b/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc
index b41d0770dc..3b40253954 100644
--- a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc
@@ -29,15 +29,16 @@ class TFProfAdvisorTest : public ::testing::Test {
                              nullptr, nullptr));
 
     stats_->AddNodeForTest(
-        "n1", CreateNode("n1", "Conv2D", {{"data_format", "NHWC"}}, 10, 2));
-    stats_->AddNodeForTest("n2", CreateNode("n2", "Conv2D", {}, 20, 2));
+        0, CreateNode("n1", "Conv2D", {{"data_format", "NHWC"}}, 0, 10, 2));
+    stats_->AddNodeForTest(0, CreateNode("n2", "Conv2D", {}, 0, 20, 2));
+    stats_->BuildAllViews();
     advisor_.reset(new Advisor(stats_.get()));
   }
 
   std::unique_ptr<TFGraphNode> CreateNode(const string& name,
                                           const string& type,
                                           std::map<string, string> attrs,
-                                          int64 start_miros,
+                                          int64 step, int64 start_miros,
                                           int64 end_rel_micros) {
     node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
     NodeDef* def = node_defs_.back().get();
@@ -52,10 +53,10 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
                       node_stat);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
                       node_stat);
     return node;
   }
@@ -66,23 +67,38 @@ class TFProfAdvisorTest : public ::testing::Test {
 };
 
 TEST_F(TFProfAdvisorTest, Basics) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_TRUE(reports.find("AcceleratorUtilizationChecker") != reports.end());
-  EXPECT_TRUE(reports.find("OperationChecker") != reports.end());
+  AdvisorOptionsProto options = Advisor::DefaultOptions();
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_TRUE(advice.checkers().find(kCheckers[0]) != advice.checkers().end());
+  EXPECT_TRUE(advice.checkers().find(kCheckers[1]) != advice.checkers().end());
+  EXPECT_TRUE(advice.checkers().find(kCheckers[2]) != advice.checkers().end());
 }
 
 TEST_F(TFProfAdvisorTest, OperationChecker) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_EQ(reports["OperationChecker"].size(), 1);
-  EXPECT_TRUE(StringPiece(reports["OperationChecker"][0]).contains("NCHW"));
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[1]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
+                  .contains("NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_EQ(reports["AcceleratorUtilizationChecker"].size(), 1);
-  EXPECT_TRUE(StringPiece(reports["AcceleratorUtilizationChecker"][0])
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[0]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
                   .contains("low utilization"));
 }
 
+TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[2]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
+                  .contains("top 1 operation type: Conv2D"));
+}
+
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
index 37d01db3a1..5a9c44d8e6 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
 #include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -36,6 +37,18 @@ TFStats* tf_stat = nullptr;
 
 string RunProfile(const string& command, const string& options,
                   TFStats* tf_stats) {
+  if (command == kCmds[4]) {
+    AdvisorOptionsProto option_pb;
+    if (!option_pb.ParseFromString(options)) {
+      fprintf(stderr, "Cannot parse AdvisorOptionsProto\n");
+      return "";
+    }
+    tf_stats->BuildAllViews();
+    return Advisor(tf_stats).Advise(option_pb).SerializeAsString();
+  } else {
+    tf_stats->BuildView(command);
+  }
+
   Options opts;
   tensorflow::Status s = Options::FromProtoStr(options, &opts);
   if (!s.ok()) {
@@ -97,14 +110,14 @@ void AddStep(int64 step, const string* run_meta, const string* op_log) {
   // TODO(xpan): Better error handling.
   std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata());
   run_meta_ptr->ParseFromString(*run_meta);
-  tf_stat->ParseRunMeta(step, std::move(run_meta_ptr));
+  tf_stat->AddRunMeta(step, std::move(run_meta_ptr));
 
   std::unique_ptr<OpLog> op_log_ptr;
   if (op_log && !op_log->empty()) {
     op_log_ptr.reset(new OpLog());
     op_log_ptr->ParseFromString(*op_log);
   }
-  tf_stat->ParseOpLog(std::move(op_log_ptr));
+  tf_stat->AddOpLog(std::move(op_log_ptr));
 }
 
 string Profile(const string* command, const string* options) {
@@ -144,7 +157,5 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   return RunProfile(*command, *options, &tf_stats);
 }
 
-void Advise() { Advisor(tf_stat).Advise(); }
-
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.h b/tensorflow/tools/tfprof/internal/print_model_analysis.h
index 84165e542d..46db63646d 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.h
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.h
@@ -39,8 +39,6 @@ void AddStep(int64 step, const string* run_meta, const string* op_log);
 
 string Profile(const string* command, const string* options);
 
-void Advise();
-
 // Single-step Profiler.
 //
 // Interface defined for Python API swig. Calls the tfprof core API.
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.h b/tensorflow/tools/tfprof/internal/tfprof_graph.h
index fbeae8673d..194a21f0cc 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.h
@@ -54,8 +54,8 @@ class TFGraph : public TFShow {
   const ShowNode* ShowInternal(const Options& opts,
                                Timeline* timeline) override;
 
-  bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
-                         int depth) override {
+  bool ShouldShowIfExtra(const ShowNode* node, const Options& opts,
+                         int depth) const override {
     return true;
   }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_op.cc b/tensorflow/tools/tfprof/internal/tfprof_op.cc
index 6a7077c085..77a2593623 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_op.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_op.cc
@@ -126,6 +126,7 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
   }
   nodes = SortNodes(nodes, opts);
 
+  // pre keeps track of the previously visited node.
   OpNode* pre = nullptr;
   std::vector<OpNode*> account_nodes;
   for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
@@ -170,16 +171,20 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     root_->ResetTotalStats();
     if (pre) {
       root_->AggregateTotalStats(pre);
-      root_->mutable_proto()->add_children()->MergeFrom(pre->proto());
-      pre->mutable_proto()->clear_children();
     }
   }
+  if (pre) {
+    root_->mutable_proto()->add_children()->MergeFrom(pre->proto());
+    pre->mutable_proto()->clear_children();
+  }
 
   if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
     string display_str = FormatLegend(opts);
     for (OpNode* node : show_nodes) {
       display_str += FormatNode(node, root_.get(), opts);
     }
+    // In op view, we don't show root (total), but it is still in the proto.
+    // TODO(xpan): Is it the right choice?
     root_->formatted_str = display_str;
   }
   return root_.get();
@@ -201,7 +206,7 @@ int64 TFOp::SearchRoot(const std::vector<OpNode*> nodes,
   return i;
 }
 
-string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) {
+string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
   std::vector<string> attrs;
 
   if (opts.select.find(kShown[0]) != opts.select.end()) {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_op.h b/tensorflow/tools/tfprof/internal/tfprof_op.h
index 5b16490363..34812f54be 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_op.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_op.h
@@ -56,15 +56,15 @@ class TFOp : public TFMultiShow {
   int64 SearchRoot(const std::vector<OpNode*> nodes,
                    const std::vector<string>& regexes);
 
-  bool ShouldShowIfExtra(ShowMultiNode* node, const Options& opts,
-                         int depth) override {
+  bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts,
+                         int depth) const override {
     if (opts.min_occurrence > node->node->graph_nodes().size()) {
       return false;
     }
     return true;
   }
 
-  string FormatNode(OpNode* node, OpNode* root, const Options& opts);
+  string FormatNode(OpNode* node, OpNode* root, const Options& opts) const;
 
   std::unique_ptr<OpNode> root_;
   std::map<string, std::unique_ptr<OpNode>> cnodes_map_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.h b/tensorflow/tools/tfprof/internal/tfprof_options.h
index 6c9db24342..d39333e3fc 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.h
@@ -59,10 +59,10 @@ static const char* const kShown[] = {
     "cpu_micros"};
 
 static const char* const kCmds[] = {
-    "scope", "graph", "code", "op", "set", "help",
+    "scope", "graph", "code", "op", "advise", "set", "help",
 };
 
-static const char* const kOutput[] = {"timeline", "stdout", "file"};
+static const char* const kOutput[] = {"timeline", "stdout", "file", "none"};
 
 static const char* const kTimelineOpts[] = {
     "outfile",
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.cc b/tensorflow/tools/tfprof/internal/tfprof_show.cc
index 40cc56fb22..6c1183a4d5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.cc
@@ -26,7 +26,9 @@ namespace tensorflow {
 namespace tfprof {
 
 const TFGraphNodeProto& TFShow::Show(const Options& opts) {
-  if (opts.output_type == kOutput[0]) {
+  if (opts.output_type == kOutput[3]) {
+    return ShowInternal(opts, nullptr)->proto();
+  } else if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else if (opts.output_type == kOutput[2]) {
@@ -64,7 +66,8 @@ bool TFShow::LookUpCheckPoint(const string& name,
   return true;
 }
 
-bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
+bool TFShow::ShouldShow(const ShowNode* node, const Options& opts,
+                        int depth) const {
   // Always show kTFProfRoot.
   if (node->name() == kTFProfRoot) return true;
 
@@ -97,7 +100,8 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
   return true;
 }
 
-bool TFShow::ShouldTrim(ShowNode* node, const std::vector<string>& regexes) {
+bool TFShow::ShouldTrim(const ShowNode* node,
+                        const std::vector<string>& regexes) const {
   for (const string& regex : regexes) {
     if (RE2::FullMatch(node->name(), regex)) {
       return true;
@@ -122,7 +126,7 @@ bool TFShow::ReAccount(ShowNode* node, const Options& opts) {
   return false;
 }
 
-string TFShow::FormatNode(ShowNode* node, const Options& opts) {
+string TFShow::FormatNode(ShowNode* node, const Options& opts) const {
   std::vector<string> info;
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     const string shape = FormatShapes(node->node->shape());
@@ -210,7 +214,7 @@ string TFShow::FormatNode(ShowNode* node, const Options& opts) {
                          str_util::Join(info, ", ").c_str());
 }
 
-string TFShow::FormatLegend(const Options& opts) {
+string TFShow::FormatLegend(const Options& opts) const {
   std::vector<string> legends;
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     legends.push_back("# parameters");
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.h b/tensorflow/tools/tfprof/internal/tfprof_show.h
index 2c61b4fd73..95513e086f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.h
@@ -54,20 +54,21 @@ class TFShow {
                         std::unique_ptr<TFProfTensor>* tensor);
 
   // Overridden by subclass if extra requirements need to be met.
-  virtual bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
-                                 int depth) {
+  virtual bool ShouldShowIfExtra(const ShowNode* node, const Options& opts,
+                                 int depth) const {
     return true;
   }
 
-  bool ShouldShow(ShowNode* node, const Options& opts, int depth);
+  bool ShouldShow(const ShowNode* node, const Options& opts, int depth) const;
 
-  bool ShouldTrim(ShowNode* node, const std::vector<string>& regexes);
+  bool ShouldTrim(const ShowNode* node,
+                  const std::vector<string>& regexes) const;
 
   bool ReAccount(ShowNode* node, const Options& opts);
 
-  string FormatNode(ShowNode* node, const Options& opts);
+  string FormatNode(ShowNode* node, const Options& opts) const;
 
-  string FormatLegend(const Options& opts);
+  string FormatLegend(const Options& opts) const;
 
   template <typename T>
   std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc b/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc
index 7545b3d48b..84d542714a 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc
@@ -28,7 +28,9 @@ namespace tensorflow {
 namespace tfprof {
 
 const TFMultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
-  if (opts.output_type == kOutput[0]) {
+  if (opts.output_type == kOutput[3]) {
+    return ShowInternal(opts, nullptr)->proto();
+  } else if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else if (opts.output_type == kOutput[2]) {
@@ -48,8 +50,8 @@ const TFMultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
   }
 }
 
-bool TFMultiShow::ShouldShow(ShowMultiNode* node, const Options& opts,
-                            int depth) {
+bool TFMultiShow::ShouldShow(const ShowMultiNode* node, const Options& opts,
+                             int depth) const {
   // Always show kTFProfRoot.
   if (node->name() == kTFProfRoot) return true;
 
@@ -88,8 +90,8 @@ bool TFMultiShow::ShouldShow(ShowMultiNode* node, const Options& opts,
   return true;
 }
 
-bool TFMultiShow::ShouldTrim(ShowMultiNode* node,
-                            const std::vector<string>& regexes) {
+bool TFMultiShow::ShouldTrim(const ShowMultiNode* node,
+                             const std::vector<string>& regexes) const {
   for (const string& regex : regexes) {
     if (RE2::FullMatch(node->name(), regex)) {
       return true;
@@ -102,7 +104,7 @@ bool TFMultiShow::ReAccount(ShowMultiNode* node, const Options& opts) {
   return node->ReInit(opts.step, opts.account_type_regexes);
 }
 
-string TFMultiShow::FormatLegend(const Options& opts) {
+string TFMultiShow::FormatLegend(const Options& opts) const {
   std::vector<string> legends;
   if (opts.select.find(kShown[0]) != opts.select.end()) {
     legends.push_back("output bytes");
@@ -142,7 +144,8 @@ string TFMultiShow::FormatLegend(const Options& opts) {
                          str_util::Join(legends, " | ").c_str());
 }
 
-string TFMultiShow::FormatInputShapes(const TFMultiGraphNodeProto& proto) {
+string TFMultiShow::FormatInputShapes(
+    const TFMultiGraphNodeProto& proto) const {
   // input_shape string -> (static defined count, run count, run_micros)
   std::map<string, std::tuple<int64, int64, int64>> input_shapes_attr;
   for (int i = 0; i < proto.graph_nodes_size(); ++i) {
@@ -199,7 +202,7 @@ string TFMultiShow::FormatInputShapes(const TFMultiGraphNodeProto& proto) {
 }
 
 std::vector<string> TFMultiShow::FormatTimes(const ShowMultiNode* node,
-                                             const Options& opts) {
+                                             const Options& opts) const {
   std::vector<string> attrs;
   if (opts.select.find(kShown[1]) != opts.select.end()) {
     attrs.push_back(FormatTotalExecTime(node, opts));
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_multi.h b/tensorflow/tools/tfprof/internal/tfprof_show_multi.h
index e6faf1231d..ce309816a9 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_multi.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_multi.h
@@ -55,21 +55,23 @@ class TFMultiShow {
                         std::unique_ptr<TFProfTensor>* tensor);
 
   // Overridden by subclass if extra requirements need to be met.
-  virtual bool ShouldShowIfExtra(ShowMultiNode* node, const Options& opts,
-                                 int depth) {
+  virtual bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts,
+                                 int depth) const {
     return true;
   }
 
-  bool ShouldShow(ShowMultiNode* node, const Options& opts, int depth);
+  bool ShouldShow(const ShowMultiNode* node, const Options& opts,
+                  int depth) const;
 
-  bool ShouldTrim(ShowMultiNode* node, const std::vector<string>& regexes);
+  bool ShouldTrim(const ShowMultiNode* node,
+                  const std::vector<string>& regexes) const;
 
   bool ReAccount(ShowMultiNode* node, const Options& opts);
 
-  string FormatLegend(const Options& opts);
-  string FormatInputShapes(const TFMultiGraphNodeProto& proto);
+  string FormatLegend(const Options& opts) const;
+  string FormatInputShapes(const TFMultiGraphNodeProto& proto) const;
   std::vector<string> FormatTimes(const ShowMultiNode* node,
-                                  const Options& opts);
+                                  const Options& opts) const;
 
   template <typename T>
   std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
index 18a7c51a20..46b6ebc09a 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
@@ -65,6 +65,7 @@ class TFProfShowTest : public ::testing::Test {
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
index f5b8dad4e2..64da7ae7cf 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
@@ -29,16 +29,17 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
                  std::unique_ptr<RunMetadata> run_meta,
                  std::unique_ptr<OpLog> op_log,
                  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
-    : graph_(std::move(graph)),
+    : has_code_traces_(false),
+      graph_(std::move(graph)),
       ckpt_reader_(std::move(ckpt_reader)) {
   CHECK(graph_) << "Must at least have GraphDef";
 
   printf("Parsing Inputs...\n");
   ParseGraph();
   if (run_meta && run_meta->has_step_stats()) {
-    ParseRunMeta(0, std::move(run_meta));
+    AddRunMeta(0, std::move(run_meta));
   }
-  ParseOpLog(std::move(op_log));
+  AddOpLog(std::move(op_log));
 
   if (ckpt_reader_) {
     for (const auto& v : ckpt_reader_->GetVariableToShapeMap()) {
@@ -48,27 +49,48 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
       }
     }
   }
+}
 
-  printf("Preparing Views...\n");
-  scope_view_ = std::unique_ptr<TFScope>(new TFScope(ckpt_reader_.get()));
-  graph_view_ = std::unique_ptr<TFGraph>(new TFGraph(ckpt_reader_.get()));
-  code_view_ = std::unique_ptr<TFCode>(new TFCode());
-  op_view_ = std::unique_ptr<TFOp>(new TFOp());
+void TFStats::BuildView(const string& cmd) {  // No-op if view exists.
+  if (cmd == kCmds[0] && !scope_view_) {  // "scope"
+    scope_view_.reset(new TFScope(ckpt_reader_.get()));
+    for (const auto& name_and_node : nodes_map_) {
+      scope_view_->AddNode(name_and_node.second.get());
+    }
+    scope_view_->Build();
+  }
+  if (cmd == kCmds[1] && !graph_view_) {  // "graph"
+    graph_view_.reset(new TFGraph(ckpt_reader_.get()));
+    for (const auto& name_and_node : nodes_map_) {
+      graph_view_->AddNode(name_and_node.second.get());
+    }
+    graph_view_->Build();
+  }
+  if (cmd == kCmds[2] && !code_view_) {  // "code"
+    code_view_.reset(new TFCode());
+    for (const auto& name_and_node : nodes_map_) {
+      code_view_->AddNode(name_and_node.second.get());
+    }
+    code_view_->Build();
+  }
+  if (cmd == kCmds[3] && !op_view_) {  // "op"
+    op_view_.reset(new TFOp());
+    for (const auto& name_and_node : nodes_map_) {
+      op_view_->AddNode(name_and_node.second.get());
+    }
+    op_view_->Build();
+  }
+}
 
-  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
-    scope_view_->AddNode(it->second.get());
-    graph_view_->AddNode(it->second.get());
-    code_view_->AddNode(it->second.get());
-    op_view_->AddNode(it->second.get());
-  }
-  scope_view_->Build();
-  graph_view_->Build();
-  code_view_->Build();
-  op_view_->Build();
+void TFStats::BuildAllViews() {
+  // kCmds also names non-view commands; BuildView() is a no-op for those.
+  for (const char* cmd_name : kCmds) {
+    BuildView(cmd_name);
+  }
 }
 
 const TFGraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
-                                               const Options& opts) {
+                                               const Options& opts) const {
   if (!Validate(opts)) {
     return empty_graph_node_;
   }
@@ -82,8 +104,8 @@ const TFGraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
   }
 }
 
-const TFMultiGraphNodeProto& TFStats::ShowMultiGraphNode(const string& cmd,
-                                                         const Options& opts) {
+const TFMultiGraphNodeProto& TFStats::ShowMultiGraphNode(
+    const string& cmd, const Options& opts) const {
   if (!Validate(opts)) {
     return empty_multi_graph_node_;
   }
@@ -130,7 +152,7 @@ void TFStats::ParseGraph() {
   }
 }
 
-void TFStats::ParseOpLog(std::unique_ptr<OpLog> op_log) {
+void TFStats::AddOpLog(std::unique_ptr<OpLog> op_log) {
   if (!op_log) {
     return;
   }
@@ -144,12 +166,13 @@ void TFStats::ParseOpLog(std::unique_ptr<OpLog> op_log) {
       node->second->AddFloatOps(entry.float_ops());
     }
     if (entry.has_code_def()) {
+      has_code_traces_ = true;
       node->second->AddCode(entry.code_def());
     }
   }
 }
 
-void TFStats::ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
+void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   if (!run_meta || !run_meta->has_step_stats()) {
     fprintf(stderr, "Invalid RunMetadata for step %lld\n", step);
     return;
@@ -176,7 +199,7 @@ void TFStats::ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   }
 }
 
-bool TFStats::Validate(const Options& opts) {
+bool TFStats::Validate(const Options& opts) const {
   if (opts.step >= 0 && steps_.find(opts.step) == steps_.end()) {
     fprintf(stderr, "Options -step=%lld not found\n", opts.step);
     return false;
@@ -184,9 +207,9 @@ bool TFStats::Validate(const Options& opts) {
   return true;
 }
 
-void TFStats::AddNodeForTest(const string& name,
-                             std::unique_ptr<TFGraphNode> node) {
-  nodes_map_[name] = std::move(node);
+void TFStats::AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node) {
+  steps_.insert(step);
+  nodes_map_[node->name()] = std::move(node);
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.h b/tensorflow/tools/tfprof/internal/tfprof_stats.h
index dfb190e703..b26d274f80 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.h
@@ -59,28 +59,38 @@ class TFStats {
   const std::map<string, std::unique_ptr<TFGraphNode>>& nodes() const {
     return nodes_map_;
   }
+  const std::set<int64>& steps() const { return steps_; }
+  bool has_code_traces() const { return has_code_traces_; }
 
+  void BuildView(const string& cmd);
+  void BuildAllViews();
+
+  // Note: BuildView(view_foo) must be called before ShowXXX(view_foo).
+  //
   // Organize the TensorFlow model as different types of views, and generate
   // outputs for profiling.
-  const TFGraphNodeProto& ShowGraphNode(const string& cmd, const Options& opts);
+  // TODO(xpan): Should it return reference here?
+  const TFGraphNodeProto& ShowGraphNode(const string& cmd,
+                                        const Options& opts) const;
   const TFMultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
-                                                  const Options& opts);
+                                                  const Options& opts) const;
 
   // Add a step of run time meta data.
-  void ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
+  void AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
   // Add tfprof operation meta data, such as customized op type, float_ops,
   // and code traces.
-  void ParseOpLog(std::unique_ptr<OpLog> op_log);
+  void AddOpLog(std::unique_ptr<OpLog> op_log);
 
   // For test purpose only.
-  void AddNodeForTest(const string& name, std::unique_ptr<TFGraphNode> node);
+  void AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node);
 
  private:
-  bool Validate(const Options& opts);
+  bool Validate(const Options& opts) const;
 
   void ParseGraph();
 
   std::set<int64> steps_;
+  bool has_code_traces_;
   std::unique_ptr<GraphDef> graph_;
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
index bcd1147c69..00d5d8cdb3 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
@@ -66,6 +66,7 @@ class TFProfStatsTest : public ::testing::Test {
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
index d7cfe7dade..0482657ca5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
@@ -50,6 +50,7 @@ class TFProfTensorTest : public ::testing::Test {
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
index 0e9bb9658c..cad31050a9 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
@@ -52,6 +52,7 @@ class TFProfTimelineTest : public ::testing::Test {
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 nullptr, nullptr));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
diff --git a/tensorflow/tools/tfprof/tfprof_main.cc b/tensorflow/tools/tfprof/tfprof_main.cc
index 7a4e7e85ff..5e70c093cc 100644
--- a/tensorflow/tools/tfprof/tfprof_main.cc
+++ b/tensorflow/tools/tfprof/tfprof_main.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
@@ -161,12 +162,13 @@ int Run(int argc, char** argv) {
         "Profiling everything!\n");
     return 0;
   } else if (argc > 1) {
-    if (string(argv[1]) == kCmds[5]) {
+    if (string(argv[1]) == kCmds[6]) {
       PrintHelp();
       return 0;
     }
     if (string(argv[1]) == kCmds[0] || string(argv[1]) == kCmds[1] ||
-        string(argv[1]) == kCmds[2] || string(argv[1]) == kCmds[3]) {
+        string(argv[1]) == kCmds[2] || string(argv[1]) == kCmds[3] ||
+        string(argv[1]) == kCmds[4]) {
       cmd = argv[1];
     }
   }
@@ -216,7 +218,13 @@ int Run(int argc, char** argv) {
               run_meta_files[i].c_str(), s.ToString().c_str());
       return 1;
     }
-    tf_stat.ParseRunMeta(i, std::move(run_meta));
+    tf_stat.AddRunMeta(i, std::move(run_meta));
+  }
+
+  if (cmd == kCmds[4]) {
+    tf_stat.BuildAllViews();
+    Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+    return 0;
   }
 
   Options opts(FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros,
@@ -227,9 +235,11 @@ int Run(int argc, char** argv) {
                output_type, output_options);
 
   if (cmd == kCmds[2] || cmd == kCmds[3]) {
+    tf_stat.BuildView(cmd);
     tf_stat.ShowMultiGraphNode(cmd, opts);
     return 0;
   } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
+    tf_stat.BuildView(cmd);
     tf_stat.ShowGraphNode(cmd, opts);
     return 0;
   }
@@ -254,14 +264,19 @@ int Run(int argc, char** argv) {
       fprintf(stderr, "E: %s\n", s.ToString().c_str());
       continue;
     }
-    if (cmd == kCmds[4]) {
+    if (cmd == kCmds[5]) {
       opts = new_opts;
-    } else if (cmd == kCmds[5]) {
+    } else if (cmd == kCmds[6]) {
       PrintHelp();
     } else if (cmd == kCmds[2] || cmd == kCmds[3]) {
+      tf_stat.BuildView(cmd);
       tf_stat.ShowMultiGraphNode(cmd, new_opts);
     } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
+      tf_stat.BuildView(cmd);
       tf_stat.ShowGraphNode(cmd, new_opts);
+    } else if (cmd == kCmds[4]) {
+      tf_stat.BuildAllViews();
+      Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
     }
   }
   return 0;
diff --git a/tensorflow/tools/tfprof/tfprof_output.proto b/tensorflow/tools/tfprof/tfprof_output.proto
index 7206c6f956..abf141f1e7 100644
--- a/tensorflow/tools/tfprof/tfprof_output.proto
+++ b/tensorflow/tools/tfprof/tfprof_output.proto
@@ -102,3 +102,11 @@ message TFMultiGraphNodeProto {
   // structure used.
   repeated TFMultiGraphNodeProto children = 11;
 }
+
+message AdviceProto {
+  // checker name -> a list of reports from the checker.
+  map<string, Checker> checkers = 1;
+  message Checker {
+    repeated string reports = 2;
+  }
+}
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 3e81e361a9..afe0979536 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -327,11 +327,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   patched_http_archive(
       name = "protobuf",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
-          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
-      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
-      strip_prefix = "protobuf-3.3.1",
+      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
       # TODO: remove patching when tensorflow stops linking same protos into
       #       multiple shared libraries loaded in runtime by python.
       #       This patch fixes a runtime crash when tensorflow is compiled
@@ -345,21 +345,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_google_protobuf",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
-          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
-      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
-      strip_prefix = "protobuf-3.3.1",
+      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
   )
 
   native.http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
-          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
-      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
-      strip_prefix = "protobuf-3.3.1",
+      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
   )
 
   native.new_http_archive(
diff --git a/tools/tf_env_collect.sh b/tools/tf_env_collect.sh
index a1c9c88c58..abeebeadea 100755
--- a/tools/tf_env_collect.sh
+++ b/tools/tf_env_collect.sh
@@ -16,62 +16,49 @@
 
 set -u  # Check for undefined variables
 
-die() {
-  # Print a message and exit with code 1.
-  #
-  # Usage: die <error_message>
-  #   e.g., die "Something bad happened."
-
-  echo $@
-  exit 1
-}
-
 echo "Collecting system information..."
 
 OUTPUT_FILE=tf_env.txt
-python_bin_path=$(which python || which python3 || die "Cannot find Python binary")
-
-{
-  echo
-  echo "== cat /etc/issue ==============================================="
-  uname -a
-  uname=`uname -s`
-  if [ "$(uname)" == "Darwin" ]; then
-    echo Mac OS X `sw_vers -productVersion`
-  elif [ "$(uname)" == "Linux" ]; then
-    cat /etc/*release | grep VERSION
-  fi
-  
-  echo
-  echo '== are we in docker ============================================='
-  num=`cat /proc/1/cgroup | grep docker | wc -l`;
-  if [ $num -ge 1 ]; then
-    echo "Yes"
-  else
-    echo "No"
-  fi
-  
-  echo
-  echo '== compiler ====================================================='
-  c++ --version 2>&1
-  
-  echo
-  echo '== uname -a ====================================================='
-  uname -a
-  
-  echo
-  echo '== check pips ==================================================='
-  pip list 2>&1 | grep "proto\|numpy\|tensorflow"
-  
-  
-  echo
-  echo '== check for virtualenv ========================================='
-  ${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))"
-  
-  echo
-  echo '== tensorflow import ============================================'
-} >> ${OUTPUT_FILE}
 
+echo >> $OUTPUT_FILE
+echo "== cat /etc/issue ===============================================" >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+uname=`uname -s`
+if [ "$(uname)" == "Darwin" ]; then
+  echo Mac OS X `sw_vers -productVersion` >> $OUTPUT_FILE
+elif [ "$(uname)" == "Linux" ]; then
+  cat /etc/*release | grep VERSION >> $OUTPUT_FILE
+fi
+
+
+echo >> $OUTPUT_FILE
+echo '== are we in docker =============================================' >> $OUTPUT_FILE
+num=`cat /proc/1/cgroup | grep docker | wc -l`;
+if [ $num -ge 1 ]; then
+  echo "Yes" >> $OUTPUT_FILE
+else
+  echo "No" >> $OUTPUT_FILE
+fi
+
+echo >> $OUTPUT_FILE
+echo '== compiler =====================================================' >> $OUTPUT_FILE
+c++ --version >> $OUTPUT_FILE 2>&1  # redirect file first so stderr is captured too
+
+echo >> $OUTPUT_FILE
+echo '== uname -a =====================================================' >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== check pips ===================================================' >> $OUTPUT_FILE
+pip list 2>&1 | grep "proto\|numpy\|tensorflow" >> $OUTPUT_FILE
+
+
+echo >> $OUTPUT_FILE
+echo '== check for virtualenv =========================================' >> $OUTPUT_FILE
+python -c "import sys;print(hasattr(sys, \"real_prefix\"))" >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== tensorflow import ============================================' >> $OUTPUT_FILE
 cat <<EOF > /tmp/check_tf.py
 import tensorflow as tf;
 print("tf.VERSION = %s" % tf.VERSION)
@@ -80,34 +67,32 @@ print("tf.COMPILER_VERSION = %s" % tf.GIT_VERSION)
 with tf.Session() as sess:
   print("Sanity check: %r" % sess.run(tf.constant([1,2,3])[:1]))
 EOF
-${python_bin_path} /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
-
-DEBUG_LD=libs ${python_bin_path} -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
-
-{
-  grep libcudnn.so /tmp/loadedlibs
-  echo
-  echo '== env =========================================================='
-  if [ -z ${LD_LIBRARY_PATH+x} ]; then
-    echo "LD_LIBRARY_PATH is unset";
-  else
-    echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
-  fi
-  if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
-    echo "DYLD_LIBRARY_PATH is unset";
-  else
-    echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
-  fi
-  
-  
-  echo
-  echo '== nvidia-smi ==================================================='
-  nvidia-smi 2>&1
-  
-  echo
-  echo '== cuda libs  ==================================================='
-} >> ${OUTPUT_FILE}
+python /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
+
+DEBUG_LD=libs python -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
+grep libcudnn.so /tmp/loadedlibs >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== env ==========================================================' >> $OUTPUT_FILE
+if [ -z ${LD_LIBRARY_PATH+x} ]; then
+  echo "LD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH}  >> $OUTPUT_FILE;
+fi
+if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
+  echo "DYLD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH}  >> $OUTPUT_FILE;
+fi
+
+
+echo >> $OUTPUT_FILE >> $OUTPUT_FILE
+echo '== nvidia-smi ===================================================' >> $OUTPUT_FILE
+nvidia-smi 2>&1 >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
 
+echo '== cuda libs  ===================================================' >> $OUTPUT_FILE
 find /usr/local -type f -name 'libcudart*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
 find /usr/local -type f -name 'libudnn*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
 
-- 
cgit v1.2.3