From 50b999a8336d19400ab75aea66fe46eca2f5fe0b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 27 Jun 2017 16:33:00 -0700
Subject: Merge changes from github.

PiperOrigin-RevId: 160344052
---
 CONTRIBUTING.md                                    |   7 +-
 ISSUE_TEMPLATE.md                                  |   1 +
 README.md                                          |  14 +-
 RELEASE.md                                         |   2 +
 configure                                          |  37 ++-
 tensorflow/BUILD                                   |   1 +
 tensorflow/c/generate-pc.sh                        |   2 +-
 tensorflow/cc/BUILD                                |  21 +-
 tensorflow/cc/gradients/math_grad.cc               |  26 ++
 tensorflow/cc/gradients/math_grad_test.cc          |  52 ++++
 tensorflow/cc/gradients/nn_grad.cc                 |  13 +
 tensorflow/cc/gradients/nn_grad_test.cc            |  13 +
 tensorflow/compiler/jit/BUILD                      |   5 +-
 tensorflow/compiler/jit/kernels/BUILD              |   1 +
 tensorflow/compiler/plugin/BUILD                   |   4 +-
 tensorflow/compiler/plugin/executor/BUILD          |  34 ++
 tensorflow/compiler/plugin/executor/compiler.cc    | 122 ++++++++
 tensorflow/compiler/plugin/executor/compiler.h     |  62 ++++
 tensorflow/compiler/plugin/executor/device.cc      |  60 ++++
 tensorflow/compiler/plugin/executor/executable.cc  | 147 +++++++++
 tensorflow/compiler/plugin/executor/executable.h   |  65 ++++
 tensorflow/compiler/plugin/executor/executor.cc    | 135 ++++++++
 tensorflow/compiler/plugin/executor/executor.h     | 213 +++++++++++++
 tensorflow/compiler/plugin/executor/platform.cc    | 125 ++++++++
 tensorflow/compiler/plugin/executor/platform.h     |  83 +++++
 tensorflow/compiler/plugin/executor/platform_id.h  |  31 ++
 .../compiler/plugin/executor/transfer_manager.cc   | 187 +++++++++++
 .../compiler/plugin/executor/transfer_manager.h    |  77 +++++
 tensorflow/compiler/tests/BUILD                    |  18 +-
 tensorflow/compiler/tests/ftrl_test.py             |   2 +-
 .../compiler/tf2xla/kernels/batch_matmul_op.cc     |   6 +-
 .../compiler/tf2xla/kernels/batchtospace_op.cc     |   3 +-
 .../compiler/tf2xla/kernels/depthwise_conv_ops.cc  |  19 +-
 tensorflow/compiler/tf2xla/kernels/diag_op.cc      |   8 +-
 .../compiler/tf2xla/kernels/dynamic_stitch_op.cc   |   4 +-
 tensorflow/compiler/tf2xla/kernels/slice_op.cc     |   4 +-
 tensorflow/compiler/tf2xla/kernels/split_op.cc     |  14 +-
 .../compiler/tf2xla/kernels/strided_slice_op.cc    |  44 +--
 .../compiler/tf2xla/kernels/tensor_array_ops.cc    |   8 +-
 tensorflow/compiler/tf2xla/kernels/unpack_op.cc    |   4 +-
 .../compiler/xla/client/computation_builder.cc     |   6 +-
 .../compiler/xla/client/computation_builder.h      |   4 +-
 tensorflow/compiler/xla/literal_util.cc            |  10 +-
 tensorflow/compiler/xla/literal_util_test.cc       |  57 ++++
 .../compiler/xla/service/algebraic_simplifier.cc   |   7 +-
 .../xla/service/algebraic_simplifier_test.cc       |   8 +-
 .../compiler/xla/service/buffer_assignment_test.cc |  12 +-
 .../compiler/xla/service/buffer_liveness_test.cc   |   2 +-
 .../compiler/xla/service/compile_only_service.h    |   2 +-
 .../compiler/xla/service/computation_placer.cc     |   9 +-
 .../compiler/xla/service/computation_placer.h      |   6 +-
 .../compiler/xla/service/elemental_ir_emitter.cc   |  17 +-
 .../compiler/xla/service/gpu/pad_insertion.cc      |  11 +-
 .../xla/service/hlo_constant_folding_test.cc       |   3 +-
 tensorflow/compiler/xla/service/hlo_instruction.cc |  11 +-
 tensorflow/compiler/xla/service/hlo_instruction.h  |  16 +-
 .../xla/service/hlo_rematerialization_test.cc      |  16 +-
 tensorflow/compiler/xla/service/shape_inference.cc |  28 +-
 tensorflow/compiler/xla/service/shape_inference.h  |   3 +-
 .../compiler/xla/service/shape_inference_test.cc   |  33 +-
 .../xla/service/tuple_points_to_analysis_test.cc   |   2 +-
 .../compiler/xla/service/user_computation.cc       |   6 +-
 .../xla/tests/array_elementwise_ops_test.cc        |   2 +-
 .../compiler/xla/tests/dot_operation_test.cc       |   4 +-
 tensorflow/compiler/xla/tests/fusion_test.cc       |   2 +-
 .../xla/tests/multidimensional_slice_test.cc       |   4 +-
 tensorflow/compiler/xla/tests/params_test.cc       |   2 +-
 tensorflow/compiler/xla/tests/slice_test.cc        |  51 +--
 tensorflow/compiler/xla/tests/while_test.cc        |   3 +-
 tensorflow/compiler/xla/util.h                     |  18 +-
 tensorflow/compiler/xla/xla_data.proto             |   3 +-
 .../android/TensorFlowInferenceInterface.java      |  20 +-
 .../contrib/cloud/kernels/bigquery_reader_ops.cc   |   2 +-
 .../cloud/python/ops/bigquery_reader_ops.py        |   2 +-
 tensorflow/contrib/cmake/CMakeLists.txt            |   2 +-
 tensorflow/contrib/data/README.md                  |   2 +-
 .../python/ops/bijectors/affine_impl.py            |   4 +-
 .../distributions/python/ops/relaxed_bernoulli.py  |   2 +-
 .../distributions/python/ops/sample_stats.py       |   2 +-
 .../python/ops/vector_laplace_linear_operator.py   |   2 +-
 tensorflow/contrib/graph_editor/transform.py       |   2 +-
 tensorflow/contrib/keras/python/keras/backend.py   |   6 +-
 .../contrib/keras/python/keras/layers/core.py      |   2 +-
 .../contrib/keras/python/keras/layers/recurrent.py |   2 +-
 .../contrib/keras/python/keras/models_test.py      |   2 +-
 tensorflow/contrib/layers/BUILD                    |   2 +-
 tensorflow/contrib/layers/__init__.py              |   6 +
 tensorflow/contrib/layers/python/layers/layers.py  | 221 ++++++++++++-
 .../contrib/layers/python/layers/layers_test.py    | 150 ++++++++-
 .../learn/python/learn/estimators/debug_test.py    |   4 +-
 .../contrib/learn/python/learn/estimators/head.py  |   2 +-
 .../learn/python/learn/estimators/linear_test.py   |  20 +-
 .../learn/python/learn/estimators/model_fn_test.py |  10 +-
 .../contrib/learn/python/learn/experiment.py       |   6 +-
 tensorflow/contrib/learn/python/learn/models.py    |   4 +-
 .../linalg/python/ops/linear_operator_test_util.py |   2 +-
 tensorflow/contrib/lookup/lookup_ops.py            |   4 +-
 tensorflow/contrib/makefile/build_with_docker.sh   |   2 +-
 .../contrib/makefile/compile_android_protobuf.sh   |   4 +-
 .../contrib/makefile/compile_ios_protobuf.sh       |  17 +-
 .../contrib/makefile/compile_ios_tensorflow.sh     |   4 +-
 tensorflow/contrib/makefile/compile_pi_protobuf.sh |   6 +-
 tensorflow/contrib/makefile/tf_op_files.txt        |   1 +
 tensorflow/contrib/ndlstm/python/lstm2d.py         |  54 +++-
 tensorflow/contrib/ndlstm/python/lstm2d_test.py    |   8 +
 tensorflow/contrib/remote_fused_graph/pylib/BUILD  |   1 +
 tensorflow/contrib/rnn/python/ops/rnn_cell.py      |   6 +-
 .../python/kernel_tests/beam_search_ops_test.py    |   4 +-
 tensorflow/contrib/seq2seq/python/ops/decoder.py   |   6 +-
 tensorflow/contrib/slim/README.md                  |   2 +-
 tensorflow/contrib/tfprof/README.md                |  23 ++
 .../tools/tfprof/internal/run_metadata_test.py     |   2 +-
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py |   4 +-
 .../contrib/training/python/training/evaluation.py |   2 +-
 tensorflow/contrib/verbs/rdma.cc                   |  55 +++-
 tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc    |  41 ++-
 tensorflow/contrib/verbs/verbs_util.cc             |  34 ++
 tensorflow/contrib/verbs/verbs_util.h              |  10 +
 tensorflow/core/BUILD                              |   7 +-
 tensorflow/core/common_runtime/direct_session.cc   |   4 +-
 tensorflow/core/debug/grpc_session_debug_test.cc   |   5 +-
 .../distributed_runtime/worker_cache_logger.cc     |  54 ++--
 .../core/distributed_runtime/worker_cache_logger.h |   8 +
 tensorflow/core/framework/graph_def_util.h         |   2 +-
 tensorflow/core/framework/op.h                     |   1 -
 tensorflow/core/framework/op_kernel.h              |   2 +-
 tensorflow/core/framework/tensor.h                 |   2 +-
 tensorflow/core/graph/mkl_layout_pass.cc           | 277 ++++++++++++-----
 tensorflow/core/graph/mkl_layout_pass_test.cc      | 252 ++++++++++++++-
 .../core/grappler/costs/virtual_scheduler_test.cc  |   2 +-
 tensorflow/core/grappler/grappler_item.h           |   1 +
 .../core/grappler/optimizers/auto_parallel.cc      |   5 +
 .../core/grappler/optimizers/auto_parallel.h       |   1 +
 .../core/grappler/optimizers/auto_parallel_test.cc |  42 +--
 .../core/grappler/optimizers/layout_optimizer.cc   |   2 +-
 tensorflow/core/kernels/BUILD                      |  16 +-
 tensorflow/core/kernels/adjust_contrast_op.cc      |  24 ++
 .../kernels/adjust_contrast_op_benchmark_test.cc   |   5 +
 tensorflow/core/kernels/colorspace_op.cc           |  15 +
 tensorflow/core/kernels/control_flow_ops.cc        | 136 +++++---
 tensorflow/core/kernels/cwise_op_add_2.cc          |   7 +-
 tensorflow/core/kernels/cwise_op_cosh.cc           |  37 +++
 tensorflow/core/kernels/cwise_op_gpu_add.cu.cc     |   3 +-
 tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc    |  26 ++
 tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc    |  26 ++
 tensorflow/core/kernels/cwise_op_invert.cc         |   2 +-
 tensorflow/core/kernels/cwise_op_sinh.cc           |  37 +++
 .../core/kernels/cwise_op_squared_difference.cc    |  13 +
 tensorflow/core/kernels/cwise_ops.h                |   6 +
 tensorflow/core/kernels/dynamic_stitch_op.cc       |  27 +-
 tensorflow/core/kernels/map_stage_op.cc            |   7 +-
 tensorflow/core/kernels/meta_support.cc            |   5 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc       |  69 +++--
 tensorflow/core/kernels/mkl_conv_ops.cc            |  51 ++-
 tensorflow/core/kernels/mkl_lrn_op.cc              | 112 +++----
 tensorflow/core/kernels/mkl_relu_op.cc             |  30 +-
 tensorflow/core/kernels/mkl_tfconv_op.cc           |  54 +++-
 tensorflow/core/kernels/non_max_suppression_op.cc  |  36 ++-
 tensorflow/core/kernels/priority_queue.cc          |   2 +-
 tensorflow/core/kernels/shape_ops.cc               |  56 ++--
 tensorflow/core/kernels/slice_op.cc                | 258 +++++++++++++++-
 tensorflow/core/kernels/sparse_reduce_op.cc        | 341 +++++++++++++++++++++
 tensorflow/core/kernels/sparse_reduce_sum_op.cc    | 305 ------------------
 tensorflow/core/kernels/stack_ops.cc               |  68 +++-
 tensorflow/core/kernels/tile_functor.h             | 115 +++++++
 tensorflow/core/kernels/tile_functor_cpu.cc        |  85 +++++
 tensorflow/core/kernels/tile_functor_gpu.cu.cc     | 101 ++++++
 tensorflow/core/kernels/tile_ops.cc                |  76 ++---
 tensorflow/core/kernels/tile_ops_cpu_impl.h        |  35 +--
 tensorflow/core/kernels/tile_ops_gpu_impl.h        |   1 -
 tensorflow/core/kernels/tile_ops_impl.h            |  25 +-
 tensorflow/core/kernels/topk_op.cc                 |   2 +-
 tensorflow/core/kernels/transpose_functor.h        |   7 +
 tensorflow/core/kernels/transpose_op.cc            |   5 +-
 .../kernels/typed_conditional_accumulator_base.h   |   2 +-
 tensorflow/core/lib/gtl/optional.h                 |   2 +-
 tensorflow/core/ops/math_grad.cc                   |  20 ++
 tensorflow/core/ops/math_grad_test.cc              |  20 ++
 tensorflow/core/ops/math_ops.cc                    |   8 +
 tensorflow/core/ops/nn_ops.cc                      |   2 +
 tensorflow/core/ops/ops.pbtxt                      |  75 +++++
 tensorflow/core/ops/sparse_ops.cc                  |  69 +++++
 tensorflow/core/platform/cloud/retrying_utils.cc   |   2 +-
 tensorflow/core/protobuf/worker.proto              |   2 +-
 tensorflow/core/public/version.h                   |   2 +-
 tensorflow/core/util/mkl_util.h                    |  38 ++-
 .../docs_src/api_guides/python/contrib.losses.md   |  10 +-
 tensorflow/docs_src/api_guides/python/math_ops.md  |   2 +
 tensorflow/docs_src/extend/estimators.md           |   5 +-
 tensorflow/docs_src/get_started/get_started.md     |   5 +-
 tensorflow/docs_src/get_started/mnist/beginners.md |   2 +-
 tensorflow/docs_src/get_started/mnist/mechanics.md |   2 +-
 .../docs_src/get_started/tensorboard_histograms.md |   2 +
 tensorflow/docs_src/install/install_c.md           |   2 +-
 tensorflow/docs_src/install/install_go.md          |   2 +-
 tensorflow/docs_src/install/install_java.md        |  18 +-
 tensorflow/docs_src/install/install_linux.md       |  26 +-
 tensorflow/docs_src/install/install_mac.md         |  14 +-
 tensorflow/docs_src/install/install_sources.md     |   4 +-
 tensorflow/docs_src/install/install_windows.md     |  13 +-
 .../docs_src/performance/performance_guide.md      |   8 +-
 tensorflow/docs_src/performance/quantization.md    |  13 +-
 tensorflow/docs_src/tutorials/seq2seq.md           |  13 +-
 tensorflow/docs_src/tutorials/wide.md              |  18 +-
 tensorflow/docs_src/tutorials/wide_and_deep.md     |   6 +-
 tensorflow/docs_src/tutorials/word2vec.md          |   2 +-
 tensorflow/examples/android/build.gradle           |   2 +-
 tensorflow/examples/image_retraining/retrain.py    |  43 ++-
 tensorflow/examples/label_image/README.md          |  23 +-
 tensorflow/examples/label_image/label_image.py     | 132 ++++++++
 tensorflow/examples/learn/examples_test.sh         |   2 +-
 tensorflow/examples/tutorials/mnist/mnist.py       |   2 +-
 .../examples/tutorials/word2vec/word2vec_basic.py  |  16 +-
 tensorflow/go/README.md                            |  29 --
 tensorflow/go/genop/generate.sh                    |   2 +-
 tensorflow/go/op/wrappers.go                       |  30 ++
 tensorflow/go/shape.go                             |   2 +-
 tensorflow/go/tensor.go                            |   2 +-
 tensorflow/java/BUILD                              |  26 +-
 tensorflow/java/build_defs.bzl                     |  11 +-
 tensorflow/java/maven/libtensorflow/pom.xml        |   4 +-
 tensorflow/java/maven/proto/pom.xml                |   6 +-
 tensorflow/java/maven/release.sh                   |   2 +-
 tensorflow/java/maven/run_inside_container.sh      |   2 +-
 .../java/src/main/java/org/tensorflow/Input.java   |  48 +++
 .../src/main/java/org/tensorflow/Operation.java    |  73 ++++-
 .../main/java/org/tensorflow/OperationBuilder.java |   2 +-
 .../java/src/main/java/org/tensorflow/Output.java  |  35 ++-
 .../java/src/main/java/org/tensorflow/Session.java |   4 +-
 .../src/main/java/org/tensorflow/op/NameScope.java | 146 +++++++++
 .../src/main/java/org/tensorflow/op/Scope.java     | 165 ++++++++++
 tensorflow/java/src/main/native/operation_jni.cc   |  18 ++
 tensorflow/java/src/main/native/operation_jni.h    |  11 +
 .../test/java/org/tensorflow/OperationTest.java    | 112 +++++++
 .../src/test/java/org/tensorflow/SessionTest.java  |   6 +-
 .../src/test/java/org/tensorflow/TensorTest.java   |   2 +-
 .../src/test/java/org/tensorflow/op/ScopeTest.java | 270 ++++++++++++++++
 tensorflow/python/BUILD                            |   3 +
 tensorflow/python/debug/cli/analyzer_cli.py        |   2 +-
 tensorflow/python/debug/wrappers/framework_test.py |   2 +-
 .../python/estimator/canned/dnn_linear_combined.py |   2 +-
 tensorflow/python/framework/test_util.py           |  21 +-
 tensorflow/python/kernel_tests/BUILD               |  10 -
 tensorflow/python/kernel_tests/basic_gpu_test.py   |   2 +
 tensorflow/python/kernel_tests/cwise_ops_test.py   |  15 +
 tensorflow/python/kernel_tests/fft_ops_test.py     |  63 ++--
 .../python/kernel_tests/map_stage_op_test.py       |  93 +++---
 .../python/kernel_tests/record_input_test.py       |  14 +-
 tensorflow/python/kernel_tests/shape_ops_test.py   |  18 +-
 tensorflow/python/kernel_tests/sparse_ops_test.py  |  46 ++-
 tensorflow/python/kernel_tests/stage_op_test.py    |  87 +++---
 tensorflow/python/lib/io/py_record_writer.cc       |   8 +
 tensorflow/python/lib/io/py_record_writer.h        |   1 +
 tensorflow/python/lib/io/py_record_writer.i        |   1 +
 tensorflow/python/lib/io/tf_record.py              |   5 +
 tensorflow/python/ops/array_ops.py                 |  30 +-
 tensorflow/python/ops/data_flow_ops.py             |  33 +-
 .../python/ops/distributions/special_math.py       |   2 +-
 tensorflow/python/ops/gradients_impl.py            |  52 ++--
 tensorflow/python/ops/lookup_ops.py                |   6 +-
 tensorflow/python/ops/math_grad.py                 |  18 ++
 tensorflow/python/ops/math_ops.py                  |   7 +-
 tensorflow/python/ops/rnn_cell_impl.py             |   4 +-
 tensorflow/python/ops/sparse_ops.py                |  86 ++++++
 tensorflow/python/ops/standard_ops.py              |   2 +-
 tensorflow/python/ops/tensor_array_ops.py          |   2 +-
 tensorflow/python/ops/variable_scope.py            |   5 +
 .../tools/print_selective_registration_header.py   |   2 +-
 tensorflow/python/training/input.py                |   7 +-
 tensorflow/python/training/input_test.py           |   7 +
 tensorflow/python/training/momentum.py             |   2 +-
 tensorflow/tensorflow.bzl                          |   2 +-
 tensorflow/tools/api/golden/tensorflow.pbtxt       |  16 +
 .../tensorflow.python_io.-t-f-record-writer.pbtxt  |   4 +
 tensorflow/tools/ci_build/builds/pip.sh            |  27 +-
 tensorflow/tools/ci_build/ci_build.sh              |  10 +-
 .../tools/ci_build/ci_parameterized_build.sh       |  17 +-
 tensorflow/tools/ci_build/ci_sanity.sh             |  24 +-
 .../tools/ci_build/install/install_pip_packages.sh |   4 +-
 .../tools/ci_build/install/install_proto3.sh       |   6 +-
 .../install/install_python3.5_pip_packages.sh      |   2 +-
 .../ci_build/protobuf/protobuf_optimized_pip.sh    |   2 +-
 .../tools/ci_build/windows/bazel/bazel_test_lib.sh |   2 +-
 .../tools/ci_build/windows/cpu/cmake/run_py.bat    |   5 +-
 .../tools/ci_build/windows/gpu/cmake/run_py.bat    |   5 +-
 tensorflow/tools/dist_test/local_test.sh           |   6 +-
 tensorflow/tools/docker/README.md                  |  28 +-
 .../tools/docker/parameterized_docker_build.sh     |  44 ++-
 tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh     |  11 +-
 tensorflow/tools/git/gen_git_source.sh             |   2 +-
 tensorflow/tools/graph_transforms/README.md        |   2 +-
 .../graph_transforms/quantize_weights_test.cc      |   4 +-
 .../tools/lib_package/libtensorflow_java_test.sh   |   2 +-
 tensorflow/tools/lib_package/libtensorflow_test.sh |   2 +-
 tensorflow/tools/pip_package/setup.py              |   2 +-
 tensorflow/tools/tfprof/README.md                  |   6 +-
 tensorflow/tools/tfprof/g3doc/command_line.md      |   2 +-
 tensorflow/workspace.bzl                           |  24 +-
 third_party/gpus/cuda_configure.bzl                |   4 +-
 third_party/py/BUILD.tpl                           |  12 +
 third_party/py/python_configure.bzl                |  56 +++-
 third_party/sycl/crosstool/computecpp.tpl          |   6 +-
 third_party/sycl/sycl/BUILD.tpl                    |   2 +-
 tools/tf_env_collect.sh                            | 143 +++++----
 304 files changed, 6784 insertions(+), 1689 deletions(-)
 create mode 100644 tensorflow/compiler/plugin/executor/BUILD
 create mode 100644 tensorflow/compiler/plugin/executor/compiler.cc
 create mode 100644 tensorflow/compiler/plugin/executor/compiler.h
 create mode 100644 tensorflow/compiler/plugin/executor/device.cc
 create mode 100644 tensorflow/compiler/plugin/executor/executable.cc
 create mode 100644 tensorflow/compiler/plugin/executor/executable.h
 create mode 100644 tensorflow/compiler/plugin/executor/executor.cc
 create mode 100644 tensorflow/compiler/plugin/executor/executor.h
 create mode 100644 tensorflow/compiler/plugin/executor/platform.cc
 create mode 100644 tensorflow/compiler/plugin/executor/platform.h
 create mode 100644 tensorflow/compiler/plugin/executor/platform_id.h
 create mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.cc
 create mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.h
 create mode 100644 tensorflow/core/kernels/cwise_op_cosh.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_sinh.cc
 create mode 100644 tensorflow/core/kernels/sparse_reduce_op.cc
 delete mode 100644 tensorflow/core/kernels/sparse_reduce_sum_op.cc
 create mode 100644 tensorflow/core/kernels/tile_functor.h
 create mode 100644 tensorflow/core/kernels/tile_functor_cpu.cc
 create mode 100644 tensorflow/core/kernels/tile_functor_gpu.cu.cc
 create mode 100644 tensorflow/examples/label_image/label_image.py
 create mode 100644 tensorflow/java/src/main/java/org/tensorflow/Input.java
 create mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
 create mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
 create mode 100644 tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c78b6b1a15..43abdaafbf 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -159,7 +159,12 @@ There are two ways to run TensorFlow unit tests.
    bazel test ${flags} //tensorflow/python/...
    ```
 
-2. Using Docker and TensorFlow's CI scripts.
+2. Using [Docker](www.docker.com) and TensorFlow's CI scripts.
+
+   ```bash
+   # Install Docker first, then this will build and run cpu tests
+   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+   ```
 
    See
    [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 6f4c048ce8..4f1666c2e7 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -6,6 +6,7 @@ If you open a GitHub issue, here is our policy:
 
 1. It must be a bug or a feature request.
 2. The form below must be filled out.
+3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
 **Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
 
diff --git a/README.md b/README.md
index e7dbf57b25..abbead98a7 100644
--- a/README.md
+++ b/README.md
@@ -34,13 +34,13 @@ and discussion.**
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
-* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
 #### *Try your first TensorFlow program*
diff --git a/RELEASE.md b/RELEASE.md
index d22c5c62fe..9875838d7e 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -113,6 +113,8 @@
   checkpoints containing such RNN cells, in which case you can use the
   [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
   to convert the variable names in your old checkpoints.
+* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal
+  (explicit) kernel methods in TensorFlow.
 
 ## Bug Fixes and Other Changes
 * In python, `Operation.get_attr` on type attributes returns the Python DType
diff --git a/configure b/configure
index e1aaddabda..1531b4d40a 100755
--- a/configure
+++ b/configure
@@ -162,8 +162,12 @@ bazel version > bazel.version
 curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3)
 rm -f bazel.version
 
+
 echo "You have bazel $curr_bazel_version installed."
-if [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
+if [ -z "$curr_bazel_version" ]; then
+  echo "WARNING: current bazel installation is not a release version."
+  echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION."
+elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
   echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!"
   echo "Exiting..."
   exit 1
@@ -535,9 +539,9 @@ done
 # Set default CUDA version if not set
 if [ -z "$TF_CUDA_VERSION" ]; then
   TF_CUDA_VERSION="8.0"
-  export TF_CUDA_VERSION 
+  export TF_CUDA_VERSION
 fi
-write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" 
+write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
 
 # Set up which gcc nvcc should use as the host compiler
 # No need to set this on Windows
@@ -586,6 +590,9 @@ while true; do
     # Result returned from "read" will be used unexpanded. That make "~" unusable.
     # Going through one more level of expansion to handle that.
     CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
+    if is_windows; then
+      CUDNN_INSTALL_PATH="$(cygpath -m "$CUDNN_INSTALL_PATH")"
+    fi
   fi
 
   if [[ -z "$TF_CUDNN_VERSION" ]]; then
@@ -652,16 +659,22 @@ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
 
 # Configure the compute capabilities that TensorFlow builds for.
 # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
+function get_native_cuda_compute_capabilities {
+  device_query_bin="$CUDA_TOOLKIT_PATH/extras/demo_suite/deviceQuery" # Also works on Windows without .exe
+  "$device_query_bin" | grep 'Capability' | grep -o '[0-9]*\.[0-9]*' | sed ':a;{N;s/\n/,/};ba'
+  exit 0 # ensure that this function always exit success even if device detection fails, to prevent the whole configure from aborting
+}
 while true; do
   fromuser=""
-  default_cuda_compute_capabilities="3.5,5.2"
+  native_cuda_compute_capabilities=$(get_native_cuda_compute_capabilities)
+  default_cuda_compute_capabilities=${native_cuda_compute_capabilities:-"3.5,5.2"}
   if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
 cat << EOF
 Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
 EOF
-    read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
+    read -p "[Default is: \"$default_cuda_compute_capabilities\"]: " TF_CUDA_COMPUTE_CAPABILITIES
     fromuser=1
   fi
   if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
@@ -832,17 +845,17 @@ while true; do
     if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then
         break
     fi
- 
+
     echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found."
     if [ -z "$fromuser" ]; then
         exit 1
     fi
 
     # Retry
-    MPI_HOME="" 
+    MPI_HOME=""
 done
-    
-    
+
+
 if [ "$TF_NEED_MPI" == "1" ]; then
   write_to_bazelrc 'build --define with_mpi_support=true'
 
@@ -850,11 +863,11 @@ if [ "$TF_NEED_MPI" == "1" ]; then
   ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h
 
 
-  #Determine if we use OpenMPI or MVAPICH, these require different header files 
+  #Determine if we use OpenMPI or MVAPICH, these require different header files
   #to be included here to make bazel dependency checker happy
 
   if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then
-        #OpenMPI 
+        #OpenMPI
         ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/
         sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl
  else
@@ -864,7 +877,7 @@ if [ "$TF_NEED_MPI" == "1" ]; then
         sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl
  fi
 
-  
+
   if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then
     ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/
   else
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 62ae5ae78c..6e76db02de 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -208,6 +208,7 @@ filegroup(
         "//tensorflow/compiler/jit/kernels:all_files",
         "//tensorflow/compiler/jit/legacy_flags:all_files",
         "//tensorflow/compiler/jit/ops:all_files",
+        "//tensorflow/compiler/plugin/executor:all_files",
         "//tensorflow/compiler/tests:all_files",
         "//tensorflow/compiler/tf2xla:all_files",
         "//tensorflow/compiler/tf2xla/cc:all_files",
diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
index 73d427d9b2..02a6a58b61 100755
--- a/tensorflow/c/generate-pc.sh
+++ b/tensorflow/c/generate-pc.sh
@@ -26,7 +26,7 @@ usage() {
 [ $# == 0 ] && usage && exit 0
 
 # read the options
-ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
+ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@")
 eval set -- "$ARGS"
 
 # extract options and their arguments into variables.
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index a884f11d48..9801add1da 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -472,10 +472,23 @@ cc_binary(
     name = "tutorials_example_trainer",
     srcs = ["tutorials/example_trainer.cc"],
     copts = tf_copts(),
-    linkopts = [
-        "-lpthread",
-        "-lm",
-    ],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
+        "//tensorflow:darwin": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//tensorflow:ios": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//conditions:default": [
+            "-lm",
+            "-lpthread",
+            "-lrt",
+        ],
+    }),
     deps = [
         ":cc_ops",
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 8c1a01f518..71d9a8ed7b 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -162,6 +162,32 @@ Status Log1pGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Log1p", Log1pGrad);
 
+Status SinhGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = sinh(x)
+  // dy/dx = cosh(x)
+  auto dydx = Cosh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Sinh", SinhGrad);
+
+Status CoshGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = cosh(x)
+  // dy/dx = sinh(x)
+  auto dydx = Sinh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Cosh", CoshGrad);
+
 Status TanhGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index de6baa1769..1653b04378 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -45,6 +45,8 @@ class CWiseUnaryGradTest : public ::testing::Test {
     EXPM1,
     LOG,
     LOG1P,
+    SINH,
+    COSH,
     TANH,
     SIGMOID,
     SIGN,
@@ -111,6 +113,12 @@ class CWiseUnaryGradTest : public ::testing::Test {
       case LOG1P:
         y = Log1p(scope_, x);
         break;
+      case SINH:
+        y = Sinh(scope_, x);
+        break;
+      case COSH:
+        y = Cosh(scope_, x);
+        break;
       case TANH:
         y = Tanh(scope_, x);
         break;
@@ -337,6 +345,50 @@ TEST_F(CWiseUnaryGradTest, Log1p_Complex) {
   TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
 }
 
+TEST_F(CWiseUnaryGradTest, Sinh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::cosh(x);
+  };
+  TestCWiseGrad<float>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sinh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cosh(x));
+  };
+  TestCWiseGrad<complex64>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::sinh(x);
+  };
+  TestCWiseGrad<float>(COSH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::sinh(x));
+  };
+  TestCWiseGrad<complex64>(COSH, x_fn, dy_fn, dx_fn);
+}
+
 TEST_F(CWiseUnaryGradTest, Tanh) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 5e5203d090..952b2015ed 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -46,6 +46,19 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
+Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
+                   const std::vector<Output>& grad_inputs,
+                   std::vector<Output>* grad_outputs) {
+
+  auto softmax = Exp(scope, op.output(0));
+  auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
+  auto mul = Mul(scope, sum, softmax);
+  auto dx = Sub(scope, grad_inputs[0], mul);
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LogSoftmax", LogSoftmaxGrad);
+
 Status ReluGradHelper(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 70c9bd4e08..daa87546ec 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -57,6 +57,19 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
+TEST_F(NNGradTest, LogSoftmaxGrad) {
+  TensorShape shape({5, 3});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = LogSoftmax(scope_, x);
+  // Avoid numerical instability when computing finite differences.
+  Tensor x_init_value = test::AsTensor<float>(
+          {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f,
+           0.1f, 0.3f, 0.5f, 0.7f, 0.8f,
+           -0.1f, 0.1f, 0.1f, 0.1f, 1.2f},
+          {5, 3});
+  RunTest(x, x_init_value, y, shape);
+}
+
 TEST_F(NNGradTest, ReluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 306e704415..7ebd842218 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -15,7 +15,10 @@ package_group(
 )
 
 package(
-    default_visibility = [":internal"],
+    default_visibility = [
+        ":internal",
+        "//tensorflow/compiler/plugin/executor:__pkg__",
+    ],
 )
 
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index ed204b8182..97f3512a6c 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -2,6 +2,7 @@ licenses(["notice"])  # Apache 2.0
 
 package(
     default_visibility = [
+        "//tensorflow/compiler/plugin/executor:__pkg__",
         "//tensorflow/compiler/tf2xla:internal",
     ],
 )
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index 4badd3a589..8c2e9a7c81 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -32,5 +32,7 @@ package(
 
 cc_library(
     name = "plugin",
-    deps = [],
+    deps = [
+        "//tensorflow/compiler/plugin/executor:plugin_lib",
+    ],
 )
diff --git a/tensorflow/compiler/plugin/executor/BUILD b/tensorflow/compiler/plugin/executor/BUILD
new file mode 100644
index 0000000000..2e5875705f
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/BUILD
@@ -0,0 +1,34 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "plugin_lib",
+    srcs = glob([
+        "*.cc",
+    ]),
+    hdrs = glob([
+        "*.h",
+    ]),
+    deps = [
+        "//tensorflow/compiler/jit:xla_device",
+        "//tensorflow/compiler/jit:xla_jit_headers_lib",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:xla_headers_lib",
+        "//tensorflow/compiler/xla/service",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@protobuf//:protobuf_headers",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/compiler/plugin/executor/compiler.cc b/tensorflow/compiler/plugin/executor/compiler.cc
new file mode 100644
index 0000000000..3a84f08c00
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/compiler.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdlib.h>
+#include <fstream>
+
+#include "tensorflow/compiler/plugin/executor/compiler.h"
+#include "tensorflow/compiler/plugin/executor/executable.h"
+
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
+#include "tensorflow/compiler/xla/service/hlo_cse.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
+#include "tensorflow/compiler/xla/service/inliner.h"
+#include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace se = ::perftools::gputools;
+namespace sep = ::perftools::gputools::executorplugin;
+namespace port = ::perftools::gputools::port;
+
+namespace xla {
+namespace executorplugin {
+
+/*
+ * Run optimization passes on the module.  The graph is transformed by
+ * each pass in the optimization pipeline.  The service subdirectory
+ * contains useful optimization passes.
+ */
+Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) {
+  HloPassPipeline pipeline("Executor");
+  pipeline.AddPass<Inliner>();
+  pipeline.AddPass<HloSubcomputationUnification>();
+  pipeline.AddPass<HloCSE>(false);
+
+  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
+      false, [](const Shape&, const Shape&) { return false; });
+  pipeline.AddPass<ReshapeMover>();
+  pipeline.AddPass<HloConstantFolding>();
+  pipeline.AddPass<HloCSE>(true);
+
+  pipeline.AddPass<HloDCE>();
+  pipeline.AddPass<FlattenCallGraph>();
+  return pipeline.Run(hlo_module).status();
+}
+
+StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
+        std::unique_ptr<HloModule> hlo_module,
+        se::StreamExecutor* stream_exec) {
+  TF_RET_CHECK(stream_exec != nullptr);
+
+  VLOG(1) << "Generate graph " << hlo_module->name();
+
+  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
+
+  // Typically you would visit the HLO graph, building up a compiled equivalent
+  // In this case we are using an Hlo evaluator at execution time, so we don't
+  // need to compile anything
+
+  // Create executable from only the Hlo module
+  std::unique_ptr<Executable> executable;
+  executable.reset(new ExecutorExecutable(std::move(hlo_module)));
+
+  return std::move(executable);
+}
+
+StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
+        std::vector<std::unique_ptr<HloModule>> hlo_modules,
+        std::vector<se::StreamExecutor*> stream_execs) {
+
+  return tensorflow::errors::Unimplemented(
+      "Compilation of multiple HLO modules is not supported on Executor.");
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+ExecutorCompiler::CompileAheadOfTime(
+    std::vector<std::unique_ptr<HloModule>> hlo_modules,
+    const AotCompilationOptions& aot_options) {
+
+  return tensorflow::errors::InvalidArgument(
+      "AOT compilation not supported on Executor");
+}
+
+se::Platform::Id ExecutorCompiler::PlatformId() const {
+  return sep::kExecutorPlatformId;
+}
+
+HloCostAnalysis::ShapeSizeFunction
+ExecutorCompiler::ShapeSizeBytesFunction() const {
+  return ExecutorExecutable::ShapeSizeBytes;
+}
+
+
+}  // namespace executorplugin
+}  // namespace xla
+
+REGISTER_MODULE_INITIALIZER(executor_compiler, {
+  xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
+    return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
+  });
+});
diff --git a/tensorflow/compiler/plugin/executor/compiler.h b/tensorflow/compiler/plugin/executor/compiler.h
new file mode 100644
index 0000000000..d318eefc49
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/compiler.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+namespace xla {
+namespace executorplugin {
+
+class ExecutorCompiler : public Compiler {
+ public:
+  ExecutorCompiler() {}
+  ~ExecutorCompiler() override {}
+
+  StatusOr<std::unique_ptr<Executable>> Compile(
+      std::unique_ptr<HloModule> hlo_module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
+
+  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
+      std::vector<std::unique_ptr<HloModule>> hlo_module,
+      std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
+
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      std::vector<std::unique_ptr<HloModule>> module,
+      const AotCompilationOptions& options) override;
+
+  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
+
+  perftools::gputools::Platform::Id PlatformId() const override;
+
+ private:
+  Status RunHloOptimization(HloModule* hlo_module);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler);
+};
+
+}  // namespace executorplugin
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
diff --git a/tensorflow/compiler/plugin/executor/device.cc b/tensorflow/compiler/plugin/executor/device.cc
new file mode 100644
index 0000000000..bbc39dc03f
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/device.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h"
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/jit/xla_device_ops.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+const char* const DEVICE_XLA_EXEC = "XLA_EXEC";
+const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT";
+
+constexpr std::array<DataType, 5> kExecAllTypes = {
+    {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}};
+
+class XlaExaDeviceFactory : public DeviceFactory {
+ public:
+  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
+                       std::vector<Device*>* devices) override;
+};
+
+Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options,
+                                          const string& name_prefix,
+                                          std::vector<Device*>* devices) {
+  static XlaDeviceOpRegistrations* registrations =
+      RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT);
+  (void)registrations;
+
+  std::unique_ptr<XlaDevice> device;
+  TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0,
+                                       DEVICE_EXEC_XLA_JIT, options,
+                                       name_prefix, &device));
+  devices->push_back(device.release());
+  return Status::OK();
+}
+
+REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110);
+
+// Kernel registrations
+
+static bool OpFilter(KernelDef* kdef) { return true; }
+
+REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes);
+REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes);
+REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/plugin/executor/executable.cc b/tensorflow/compiler/plugin/executor/executable.cc
new file mode 100644
index 0000000000..79eea9af3f
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/executable.cc
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/executable.h"
+#include "tensorflow/compiler/plugin/executor/executor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace se = ::perftools::gputools;
+namespace sep = ::perftools::gputools::executorplugin;
+
+namespace xla {
+namespace executorplugin {
+
+ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
+    : Executable(std::move(hlo_module), ShapeSizeBytes) {}
+
+ExecutorExecutable::~ExecutorExecutable() {}
+
+static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor,
+                                                 const Literal& literal) {
+  int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
+  void* buf = executor->Allocate(size);
+  const void* src = literal.InternalData();
+  memcpy(buf, src, size);
+  return se::DeviceMemoryBase(buf, size);
+}
+
+static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor,
+                                                 const Literal& literal) {
+  const Shape& shape = literal.shape();
+  if (shape.element_type() != xla::TUPLE) {
+    return AllocateSingleOutput(executor, literal);
+  } else {
+    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
+    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
+    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
+      se::DeviceMemoryBase out =
+          AllocateSingleOutput(executor, literal.tuple_literals(n));
+      *buf++ = out.opaque();
+    }
+
+    return se::DeviceMemoryBase(buf, size);
+  }
+}
+
+StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  se::Stream* stream = run_options->stream();
+
+  VLOG(1) << "Execute " << module().name();
+  if (VLOG_IS_ON(2)) {
+    for (const auto& a : arguments) {
+      VLOG(2) << "-- argument " << a.opaque();
+    }
+  }
+
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
+  HloComputation* computation = module().entry_computation();
+  if (computation->num_parameters() != arguments.size()) {
+    return tensorflow::errors::Internal(
+        "Mismatch between argument count and graph parameter count.");
+  }
+
+  // Create the arguments as an vector of XLA literals
+  std::vector<std::unique_ptr<Literal>> arg_literals;
+  std::vector<Literal*> arg_literals_ptrs;
+  for (int64 p = 0; p < computation->num_parameters(); p++) {
+    // Create the input literal for the parameter
+    HloInstruction* param = computation->parameter_instruction(p);
+    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
+    arg_literals_ptrs.push_back(arg_literals.back().get());
+
+    // Copy in the data from the stream_executor buffers
+    void* buffer = arg_literals.back().get()->MutableInternalData();
+    memcpy(buffer, arguments[p].opaque(),
+           ShapeUtil::ByteSizeOf(param->shape()));
+  }
+
+  // Execute the graph using the evaluator
+  HloEvaluator evaluator;
+  std::unique_ptr<Literal> output;
+  TF_ASSIGN_OR_RETURN(output,
+                      evaluator.Evaluate(computation, arg_literals_ptrs));
+
+  // Copy the result into the return buffer
+  perftools::gputools::StreamExecutor* executor(stream->parent());
+  sep::ExecutorExecutor* executorExecutor(
+      static_cast<sep::ExecutorExecutor*>(executor->implementation()));
+
+  se::DeviceMemoryBase ret =
+      AllocateOutputBuffer(executorExecutor, *(output.get()));
+
+  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    const double nanoseconds = (end_micros - start_micros) * 1000.0;
+    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
+  }
+
+  return ret;
+}
+
+StatusOr<std::unique_ptr<ShapedBuffer>> ExecutorExecutable::ExecuteOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  return tensorflow::errors::Unimplemented(
+      "ExecuteOnStream is not yet supported on Executor.");
+}
+
+StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+  return tensorflow::errors::Unimplemented(
+      "ExecuteAsyncOnStream is not yet supported on Executor.");
+}
+
+/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) {
+  if (ShapeUtil::IsOpaque(shape)) {
+    return sizeof(void*);
+  }
+  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+}
+
+
+}  // namespace executorplugin
+}  // namespace xla
diff --git a/tensorflow/compiler/plugin/executor/executable.h b/tensorflow/compiler/plugin/executor/executable.h
new file mode 100644
index 0000000000..ba3d4da21d
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/executable.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace xla {
+namespace executorplugin {
+
+class ExecutorExecutable : public Executable {
+ public:
+  ExecutorExecutable(std::unique_ptr<HloModule> hlo_module);
+  ~ExecutorExecutable() override;
+
+  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+          arguments,
+      HloExecutionProfile* hlo_execution_profile) override;
+
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      HloExecutionProfile* hlo_execution_profile) override;
+
+  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+          arguments) override;
+
+  static int64 ShapeSizeBytes(const Shape& shape);
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
+};
+
+}  // namespace executorplugin
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
diff --git a/tensorflow/compiler/plugin/executor/executor.cc b/tensorflow/compiler/plugin/executor/executor.cc
new file mode 100644
index 0000000000..e72c2711f7
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/executor.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/executor.h"
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+#include "tensorflow/compiler/xla/status_macros.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+namespace se = ::perftools::gputools;
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+host::HostStream *AsExecutorStream(Stream *stream) {
+  DCHECK(stream != nullptr);
+  return dynamic_cast<host::HostStream *>(stream->implementation());
+}
+
+ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config)
+    : plugin_config_(plugin_config) {}
+
+ExecutorExecutor::~ExecutorExecutor() {}
+
+void *ExecutorExecutor::Allocate(uint64 size) {
+  void *buf = new char[size];
+  return buf;
+}
+
+void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
+                                         uint64 offset_bytes,
+                                         uint64 size_bytes) {
+  return parent + offset_bytes;
+}
+
+void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) {
+  if (!mem->is_sub_buffer()) {
+    delete[] static_cast<char *>(mem->opaque());
+  }
+}
+
+bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst,
+                             const DeviceMemoryBase &dev_src, uint64 size) {
+  AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
+    port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
+  });
+  return true;
+}
+
+bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
+                             const void *host_src, uint64 size) {
+  AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
+    port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
+  });
+  return true;
+}
+
+port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
+                                                const void *host_src,
+                                                uint64 size) {
+  memcpy(dev_dst->opaque(), host_src, size);
+  return port::Status::OK();
+}
+
+port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst,
+                                                const DeviceMemoryBase &dev_src,
+                                                uint64 size) {
+  memcpy(host_dst, dev_src.opaque(), size);
+  return port::Status::OK();
+}
+
+bool ExecutorExecutor::HostCallback(Stream *stream,
+                                   std::function<void()> callback) {
+  AsExecutorStream(stream)->EnqueueTask(callback);
+  return true;
+}
+
+bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
+  AsExecutorStream(dependent)->EnqueueTask(
+      [other]() { other->BlockHostUntilDone(); });
+  AsExecutorStream(dependent)->BlockUntilDone();
+  return true;
+}
+
+bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) {
+  dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
+  return true;
+}
+
+bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) {
+  dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
+  return true;
+}
+
+bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) {
+  AsExecutorStream(stream)->BlockUntilDone();
+  return true;
+}
+
+DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const {
+  internal::DeviceDescriptionBuilder builder;
+
+  builder.set_device_address_bits(64);
+
+  builder.set_name("Executor");
+  builder.set_device_vendor("VectorName");
+  builder.set_platform_version("1.0");
+  builder.set_driver_version("1.0");
+  builder.set_runtime_version("1.0");
+  builder.set_pci_bus_id("1");
+  builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
+  builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/compiler/plugin/executor/executor.h b/tensorflow/compiler/plugin/executor/executor.h
new file mode 100644
index 0000000000..32fdb157e4
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/executor.h
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Declares the ExecutorExecutor class, which is a CPU-only implementation of
+// the StreamExecutor interface. For now, this is used for testing and to
+// examine the performance of host-based StreamExecutor code.
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
+
+#include "tensorflow/stream_executor/host/host_stream.h"
+#include "tensorflow/stream_executor/host/host_timer.h"
+
+#include "tensorflow/compiler/xla/shape_util.h"
+
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+#include <list>
+#include <mutex>
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
+
+class ExecutorExecutor : public internal::StreamExecutorInterface {
+ public:
+  explicit ExecutorExecutor(const PluginConfig &plugin_config);
+  ~ExecutorExecutor() override;
+
+  port::Status Init(int device_ordinal, DeviceOptions device_options) override {
+    return port::Status::OK();
+  }
+
+  bool GetKernel(const MultiKernelLoaderSpec &spec,
+                 KernelBase *kernel) override {
+    return false;
+  }
+  bool Launch(Stream *stream, const ThreadDim &thread_dims,
+              const BlockDim &block_dims, const KernelBase &kernel,
+              const KernelArgsArrayBase &args) override {
+    return false;
+  }
+
+  void *Allocate(uint64 size) override;
+  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
+                          uint64 size_bytes) override;
+  void Deallocate(DeviceMemoryBase *mem) override;
+
+  void *HostMemoryAllocate(uint64 size) override { return new char[size]; }
+  void HostMemoryDeallocate(void *mem) override {
+    delete[] static_cast<char *>(mem);
+  }
+  bool HostMemoryRegister(void *mem, uint64 size) override { return true; }
+  bool HostMemoryUnregister(void *mem) override { return true; }
+
+  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src,
+              uint64 size) override;
+  bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src,
+              uint64 size) override;
+  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst,
+                            const DeviceMemoryBase &host_src,
+                            uint64 size) override {
+    return false;
+  }
+
+  bool MemZero(Stream *stream, DeviceMemoryBase *location,
+               uint64 size) override {
+    return false;
+  }
+  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
+              uint64 size) override {
+    return false;
+  }
+  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
+                uint64 size) override {
+    return false;
+  }
+
+  // No "synchronize all activity" implemented for this platform at the moment.
+  bool SynchronizeAllActivity() override { return false; }
+  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override {
+    return false;
+  }
+
+  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+                         uint64 size) override {
+    return false;
+  }
+
+  port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst,
+                                 const void *host_src, uint64 size) override;
+  port::Status SynchronousMemcpy(void *host_dst,
+                                 const DeviceMemoryBase &pop_src,
+                                 uint64 size) override;
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst,
+                                               const DeviceMemoryBase &pop_src,
+                                               uint64 size) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+
+  port::Status AllocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status DeallocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status RecordEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status WaitForEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  Event::Status PollForEventStatus(Event *event) override {
+    return Event::Status::kError;
+  }
+
+  bool AllocateStream(Stream *stream) override { return true; }
+  void DeallocateStream(Stream *stream) override {}
+  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
+
+  bool AllocateTimer(Timer *timer) override { return true; }
+  void DeallocateTimer(Timer *timer) override {}
+  bool StartTimer(Stream *stream, Timer *timer) override;
+  bool StopTimer(Stream *stream, Timer *timer) override;
+
+  bool BlockHostUntilDone(Stream *stream) override;
+
+  int PlatformDeviceCount() override { return 1; }
+
+  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
+    return false;
+  }
+
+  DeviceDescription *PopulateDeviceDescription() const override;
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return port::Status::OK();
+  }
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return true;
+  }
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
+    return SharedMemoryConfig::kDefault;
+  }
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
+    return port::Status{port::error::UNIMPLEMENTED,
+                        "Shared memory not supported"};
+  }
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override {
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override {
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
+      override {
+    return std::unique_ptr<internal::StreamInterface>(new host::HostStream());
+  }
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
+    return std::unique_ptr<internal::TimerInterface>(new host::HostTimer());
+  }
+
+  port::StatusOr<DeviceMemoryBase> ExecuteGraph(const xla::Shape &shape,
+                                                Args args);
+
+ private:
+  DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape);
+
+  port::StatusOr<DeviceMemoryBase> AllocateOutputBuffer(
+      const xla::Shape &shape);
+
+  const PluginConfig plugin_config_;
+};
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
diff --git a/tensorflow/compiler/plugin/executor/platform.cc b/tensorflow/compiler/plugin/executor/platform.cc
new file mode 100644
index 0000000000..2f339f04a7
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform.cc
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/platform.h"
+#include "tensorflow/compiler/plugin/executor/executor.h"
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+
+namespace se = ::perftools::gputools;
+namespace sep = ::perftools::gputools::executorplugin;
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+PLATFORM_DEFINE_ID(kExecutorPlatformId);
+
+ExecutorPlatform::ExecutorPlatform() : name_("Executor") {}
+
+ExecutorPlatform::~ExecutorPlatform() {}
+
+Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; }
+
+int ExecutorPlatform::VisibleDeviceCount() const { return 1; }
+
+const string& ExecutorPlatform::Name() const { return name_; }
+
+port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
+    int ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = PluginConfig();
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*>
+ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
+    int device_ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  config.plugin_config = plugin_config;
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ExecutorPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  mutex_lock lock(executors_mutex_);
+
+  port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config);
+  if (status.ok()) {
+    return status.ValueOrDie();
+  }
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> executor =
+      GetUncachedExecutor(config);
+  if (!executor.ok()) {
+    return executor.status();
+  }
+
+  StreamExecutor* naked_executor = executor.ValueOrDie().get();
+  SE_RETURN_IF_ERROR(
+      executor_cache_.Insert(config, executor.ConsumeValueOrDie()));
+  return naked_executor;
+}
+
+port::StatusOr<std::unique_ptr<StreamExecutor>>
+ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+  auto executor = port::MakeUnique<StreamExecutor>(
+      this, port::MakeUnique<ExecutorExecutor>(config.plugin_config));
+  auto init_status = executor->Init(config.ordinal, config.device_options);
+  if (!init_status.ok()) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf(
+            "failed initializing StreamExecutor for device ordinal %d: %s",
+            config.ordinal, init_status.ToString().c_str())};
+  }
+
+  return std::move(executor);
+}
+
+void ExecutorPlatform::RegisterTraceListener(
+    std::unique_ptr<TraceListener> listener) {
+  LOG(FATAL) << "not yet implemented: register executor trace listener";
+}
+
+void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) {
+  LOG(FATAL) << "not yet implemented: unregister executor trace listener";
+}
+
+static void InitializeExecutorPlatform() {
+  std::unique_ptr<se::Platform> platform(new sep::ExecutorPlatform);
+  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+}
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+REGISTER_MODULE_INITIALIZER(executor_platform, sep::InitializeExecutorPlatform());
+
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform, multi_platform_manager);
diff --git a/tensorflow/compiler/plugin/executor/platform.h b/tensorflow/compiler/plugin/executor/platform.h
new file mode 100644
index 0000000000..c252a589d4
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform.h
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+class ExecutorPlatform : public Platform {
+ public:
+  ExecutorPlatform();
+  ~ExecutorPlatform() override;
+
+  Platform::Id id() const override;
+
+  // Device count is less clear-cut for CPUs than accelerators. This call
+  // currently returns the number of thread units in the host, as reported by
+  // base::NumCPUs().
+  int VisibleDeviceCount() const override;
+
+  const string& Name() const override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& config) override;
+
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
+
+  void UnregisterTraceListener(TraceListener* listener) override;
+
+ private:
+  // This platform's name.
+  string name_;
+
+  // mutex that guards the ordinal-to-executor map.
+  mutable mutex executors_mutex_;
+
+  // Cache of created StreamExecutors.
+  ExecutorCache executor_cache_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform);
+};
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/compiler/plugin/executor/platform_id.h b/tensorflow/compiler/plugin/executor/platform_id.h
new file mode 100644
index 0000000000..8d2b29a3e4
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform_id.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
+#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
+
+#include "tensorflow/stream_executor/platform.h"
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+extern const Platform::Id kExecutorPlatformId;
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.cc b/tensorflow/compiler/plugin/executor/transfer_manager.cc
new file mode 100644
index 0000000000..51c5deeea5
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.cc
@@ -0,0 +1,187 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/transfer_manager.h"
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace sep = ::perftools::gputools::executorplugin;
+
+namespace xla {
+namespace executorplugin {
+
+ExecutorTransferManager::ExecutorTransferManager() {}
+
+se::Platform::Id ExecutorTransferManager::PlatformId() const {
+  return se::executorplugin::kExecutorPlatformId;
+}
+
+Status ExecutorTransferManager::TransferLiteralFromDevice(
+    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
+  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
+
+  // Tuples are a special case and contain one or more shapes inside of them to
+  // an arbitrary nesting depth.
+  if (device_shape.element_type() == TUPLE) {
+    *literal->mutable_shape() = literal_shape;
+    TF_ASSIGN_OR_RETURN(
+        std::vector<se::DeviceMemoryBase> element_buffers,
+        ShallowCopyTupleFromDevice(executor, source, device_shape));
+    TF_RET_CHECK(element_buffers.size() ==
+                 ShapeUtil::TupleElementCount(device_shape));
+    for (int64 i = 0; i < element_buffers.size(); ++i) {
+      const Shape& element_device_shape = device_shape.tuple_shapes(i);
+      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
+      Literal* element_literal = literal->add_tuple_literals();
+      // Recursively call TransferFromDevice to copy over the data in the
+      // element array.
+      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
+          executor, element_buffers[i], element_device_shape,
+          element_literal_shape, element_literal));
+    }
+    return Status::OK();
+  }
+
+  *literal->mutable_shape() = device_shape;
+  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
+  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
+      executor, source, ShapeUtil::ByteSizeOf(device_shape),
+      literal->MutableInternalData()));
+  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
+    literal->Swap(
+        literal->Relayout(literal_shape.layout()).get());
+  }
+  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
+  return Status::OK();
+}
+
+StatusOr<std::vector<se::DeviceMemoryBase>>
+ExecutorTransferManager::ShallowCopyTupleFromDevice(
+    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+    const Shape& shape) {
+  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
+
+  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
+                                      nullptr);
+  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
+                                                    element_pointers.data());
+  if (!copy_status.ok()) {
+    return AddStatus(
+        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
+               copy_status.error_message()),
+        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
+  }
+
+  // Create a DeviceMemoryBase from each void* pointer.
+  std::vector<se::DeviceMemoryBase> destination;
+  for (int i = 0; i < element_pointers.size(); ++i) {
+    if (element_pointers[i] == nullptr &&
+        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
+      return FailedPrecondition("tuple contains nullptr at element %d", i);
+    }
+    int64 buffer_size =
+        ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*));
+    destination.emplace_back(element_pointers[i], buffer_size);
+  }
+  return std::move(destination);
+}
+
+Status ExecutorTransferManager::TransferLiteralToDevice(
+    se::StreamExecutor* executor, const Literal& literal,
+    se::DeviceMemoryBase* destination) {
+  const Shape& shape = literal.shape();
+
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<void*> tuple_elements_on_device;
+    for (const Literal& tuple_element : literal.tuple_literals()) {
+      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
+          GetByteSizeRequirement(tuple_element.shape()));
+      TF_RETURN_IF_ERROR(
+          TransferLiteralToDevice(executor, tuple_element, &allocation));
+      tuple_elements_on_device.push_back(allocation.opaque());
+    }
+    return TransferBufferToDevice(
+        executor, tuple_elements_on_device.size() * sizeof(void*),
+        tuple_elements_on_device.data(), destination);
+  }
+
+  return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
+                                literal.InternalData(),
+                                destination);
+}
+
+Status ExecutorTransferManager::TransferLiteralToInfeed(
+    se::StreamExecutor* executor, const Literal& literal) {
+  const Shape& shape = literal.shape();
+  VLOG(1) << "transferring literal shape to infeed: "
+          << ShapeUtil::HumanString(shape);
+
+  return Status::OK();
+}
+
+Status ExecutorTransferManager::TransferBufferToInfeed(
+    se::StreamExecutor* executor, int64 size, const void* source) {
+  return Unimplemented("Transfer to Infeed");
+}
+
+Status ExecutorTransferManager::TransferLiteralFromOutfeed(
+    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    Literal* literal) {
+  const Shape& shape = literal->shape();
+  VLOG(1) << "transferring literal shape from outfeed: "
+          << ShapeUtil::HumanString(shape);
+
+  return Status::OK();
+}
+
+Status ExecutorTransferManager::ResetDevices(
+    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
+        executors) {
+  return Unimplemented("Device reset not supported");
+}
+
+int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) {
+  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+}
+
+}  // namespace executorplugin
+}  // namespace xla
+
+static std::unique_ptr<xla::TransferManager> CreateExecutorTransferManager() {
+  return xla::MakeUnique<xla::executorplugin::ExecutorTransferManager>();
+}
+
+static bool InitModule() {
+  xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId,
+                                                &CreateExecutorTransferManager);
+  return true;
+}
+static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.h b/tensorflow/compiler/plugin/executor/transfer_manager.h
new file mode 100644
index 0000000000..7a42e5a2d7
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.h
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
+
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+#include <vector>
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace executorplugin {
+
+class ExecutorTransferManager : public TransferManager {
+ public:
+  ExecutorTransferManager();
+
+  ~ExecutorTransferManager() override {}
+
+  se::Platform::Id PlatformId() const override;
+
+  StatusOr<std::vector<se::DeviceMemoryBase>> ShallowCopyTupleFromDevice(
+      se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+      const Shape& shape) override;
+
+  Status TransferLiteralFromDevice(se::StreamExecutor* executor,
+                                   const se::DeviceMemoryBase& source,
+                                   const Shape& device_shape,
+                                   const Shape& literal_shape,
+                                   Literal* literal) override;
+
+  Status TransferLiteralToDevice(se::StreamExecutor* executor,
+                                 const Literal& literal,
+                                 se::DeviceMemoryBase* destination) override;
+
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
+                                 const Literal& literal) override;
+
+  Status TransferBufferToInfeed(se::StreamExecutor* executor,
+                                 int64 size, const void* source) override;
+
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
+
+  Status ResetDevices(
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
+
+  int64 GetByteSizeRequirement(const Shape& shape) override;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager);
+};
+
+}  // namespace executorplugin
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 044857d422..432b24756d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -175,6 +175,11 @@ tf_xla_py_test(
     name = "slice_ops_test",
     size = "small",
     srcs = ["slice_ops_test.py"],
+    # TODO(b/62962492): Test fails with assertion error.
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -456,6 +461,11 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
     ],
+    # TODO(b/62961789): Test fails with SIGABRT
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 cc_library(
@@ -524,8 +534,12 @@ cuda_py_test(
 # --dump_graph_dir, and the config file was written by hand.
 #
 # Run the following to build a minimal benchmark of the computation on Android:
-# $ bazel build -c opt --config=android_arm \
-#       third_party/tensorflow/compiler/tests:lstm_layer_inference_benchmark
+# $ bazel build -c opt --cxxopt='-std=c++11' --linkopt='-lm' \
+#   --cpu=armeabi-v7a \
+#   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+#   --crosstool_top=//external:android/crosstool \
+#   //tensorflow/compiler/tests:lstm_layer_inference_benchmark
+
 #
 # Currently the resulting binary size is ~190KB
 tf_library(
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 6b328fb618..a75a5cd2cf 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -218,7 +218,7 @@ class FtrlOptimizerTest(XLATestCase):
         self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval())
         self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval())
 
-  # When variables are intialized with Zero, FTRL-Proximal has two properties:
+  # When variables are initialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
   # with GradientDescent.
   # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is idential
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index f752fb3ae2..16b778bca4 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -94,12 +94,14 @@ class BatchMatMulOp : public XlaOpKernel {
       // Slice off individual matrices and reshape to 2D tensors.
       auto x_slice = builder->Slice(
           x_flat, {i, 0, 0},
-          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
+          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)},
+          {1, 1, 1});
       x_slice = builder->Reshape(
           x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
       auto y_slice = builder->Slice(
           y_flat, {i, 0, 0},
-          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
+          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)},
+          {1, 1, 1});
       y_slice = builder->Reshape(
           y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
 
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 47d2d747e6..21d3e64872 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -125,6 +125,7 @@ void BatchToSpace(XlaOpKernelContext* ctx,
   //       input_shape[M+1], ..., input_shape[N-1]]
   std::vector<int64> start_indices(input_rank, 0);
   std::vector<int64> end_indices = reshaped_permuted_shape;
+  std::vector<int64> strides(input_rank, 1);
   for (int i = 0; i < block_rank; ++i) {
     int64 crop_start = crops.Get<int64>({i, 0});
     int64 crop_end = crops.Get<int64>({i, 1});
@@ -139,7 +140,7 @@ void BatchToSpace(XlaOpKernelContext* ctx,
             " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i]));
   }
   xla::ComputationDataHandle output =
-      b->Slice(reshaped_permuted, start_indices, end_indices);
+      b->Slice(reshaped_permuted, start_indices, end_indices, strides);
   ctx->SetOutput(0, output);
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
index 92b371cc4e..852d2a966e 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc
@@ -172,15 +172,14 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
     } else {
       // These will be used to define the bounds of each slice.
       // Within the loop, the input_channel index will be modified.
-      gtl::InlinedVector<int64, 4> filter_begin;
-      gtl::InlinedVector<int64, 4> filter_limits;
-      gtl::InlinedVector<int64, 4> input_begin;
-      gtl::InlinedVector<int64, 4> input_limits;
+      gtl::InlinedVector<int64, 4> filter_begin(4, 0);
+      gtl::InlinedVector<int64, 4> filter_limits(4);
+      gtl::InlinedVector<int64, 4> input_begin(4, 0);
+      gtl::InlinedVector<int64, 4> input_limits(4);
+      gtl::InlinedVector<int64, 4> strides(4, 1);
       for (int i = 0; i < 4; ++i) {
-        filter_begin.push_back(0);
-        filter_limits.push_back(filter_shape.dim_size(i));
-        input_begin.push_back(0);
-        input_limits.push_back(input_shape.dim_size(i));
+        filter_limits[i] = filter_shape.dim_size(i);
+        input_limits[i] = input_shape.dim_size(i);
       }
 
       std::vector<int64> strides_for_tla{strides_[1], strides_[2]};
@@ -209,9 +208,9 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
         input_limits[3] = i + 1;
 
         xla::ComputationDataHandle filter_slice =
-            b.Slice(filter, filter_begin, filter_limits);
+            b.Slice(filter, filter_begin, filter_limits, strides);
         xla::ComputationDataHandle input_slice =
-            b.Slice(input, input_begin, input_limits);
+            b.Slice(input, input_begin, input_limits, strides);
         convs.push_back(b.ConvWithGeneralDimensions(
             input_slice, filter_slice, strides_for_tla, xla_padding, dims));
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 74994d8961..ec5017f6ab 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -125,7 +125,7 @@ class DiagPartOp : public XlaOpKernel {
     diag = builder->Reshape(diag, {new_size, new_size + 1});
 
     // Slices out the first column and reshapes to the final shape.
-    diag = builder->Slice(diag, {0, 0}, {new_size, 1});
+    diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1});
     diag = builder->Reshape(diag, new_dims);
 
     ctx->SetOutput(0, diag);
@@ -224,8 +224,9 @@ class MatrixDiagPartOp : public XlaOpKernel {
     } else if (actual_size > target_size) {
       std::vector<int64> start(flattened_dims.size(), 0);
       std::vector<int64> limits(flattened_dims.begin(), flattened_dims.end());
+      std::vector<int64> strides(flattened_dims.size(), 1);
       limits[flattened_dims.size() - 1] = target_size;
-      diag = builder->Slice(diag, start, limits);
+      diag = builder->Slice(diag, start, limits, strides);
     }
 
     // Reshape so the target values are in the first position of the last
@@ -238,8 +239,9 @@ class MatrixDiagPartOp : public XlaOpKernel {
     // Slices out the first column and reshapes to the final shape.
     std::vector<int64> start(dims.size(), 0);
     std::vector<int64> limits(dims.begin(), dims.end());
+    std::vector<int64> strides(dims.size(), 1);
     limits[last_dim] = 1;
-    diag = builder->Slice(diag, start, limits);
+    diag = builder->Slice(diag, start, limits, strides);
 
     // Collapses away the last dimension.
     dims.pop_back();
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index faa7ef0ef9..0330e34c98 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -156,6 +156,8 @@ class DynamicStitchOp : public XlaOpKernel {
                                    indices0_shape.dims());
     std::vector<int64> slice_limit(1 + data0_shape.dims() -
                                    indices0_shape.dims());
+    std::vector<int64> stride(1 + data0_shape.dims() -
+                              indices0_shape.dims(), 1);
     for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
       slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
     }
@@ -168,7 +170,7 @@ class DynamicStitchOp : public XlaOpKernel {
       // And place it in the concat list in the place indicated by
       // the index.
       to_concat[index_num] =
-          ctx->builder()->Slice(expression, slice_start, slice_limit);
+          ctx->builder()->Slice(expression, slice_start, slice_limit, stride);
     }
 
     ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0));
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 51c97d85d7..482c54a40c 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -54,7 +54,9 @@ class SliceOp : public XlaOpKernel {
     for (int i = 0; i < begin.size(); ++i) {
       limits.push_back(begin[i] + size[i]);
     }
-    ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits));
+    std::vector<int64> strides(begin.size(), 1);
+    ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits,
+                                            strides));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 017f3a110e..44ee81461e 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -77,14 +77,14 @@ class SplitOp : public XlaOpKernel {
 
     // The vectors we will use to define the slice. The entry for the
     // split dimensions varies for each output.
-    std::vector<int64> begin;
-    std::vector<int64> limits;
+    std::vector<int64> begin(input_shape.dims(), 0);
+    std::vector<int64> limits(input_shape.dims());
+    std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < input_shape.dims(); ++i) {
       // Initially set up the limits to be the full size of the input:
       // the split dimension is filled in below.
       int64 dim = input_shape.dim_size(i);
-      begin.push_back(0);
-      limits.push_back(dim);
+      limits[i] = dim;
     }
 
     auto input = ctx->Input(1);
@@ -94,7 +94,7 @@ class SplitOp : public XlaOpKernel {
       // Slice out the ith split from the split dimension.
       begin[split_dim] = i * slice_size;
       limits[split_dim] = (i + 1) * slice_size;
-      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
+      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
     }
   }
 };
@@ -188,7 +188,7 @@ class SplitVOp : public XlaOpKernel {
     std::vector<int64> begin(input_shape.dims(), 0);
     auto dim_sizes = input_shape.dim_sizes();
     std::vector<int64> limits(dim_sizes.begin(), dim_sizes.end());
-
+    std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < num_split; ++i) {
       TensorShape output_shape(input_shape);
       int slice_size = split_sizes_vec[i];
@@ -196,7 +196,7 @@ class SplitVOp : public XlaOpKernel {
 
       // Slice out the ith split from the split dimension.
       limits[split_dim] = begin[split_dim] + slice_size;
-      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
+      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
       begin[split_dim] = limits[split_dim];
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 8037e90791..6af4bd0496 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -72,55 +72,29 @@ class StridedSliceOp : public XlaOpKernel {
                        &dummy, &dummy, &dummy, &begin, &end, &strides));
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
-    gtl::InlinedVector<int64, 4> slice_begin, slice_end;
-    bool simple_strides = true;
+    gtl::InlinedVector<int64, 4> slice_begin, slice_end, slice_strides;
+
     for (int i = 0; i < begin.size(); ++i) {
-      simple_strides &= (std::abs(strides[i]) == 1);
       if (strides[i] > 0) {
         slice_begin.push_back(begin[i]);
         slice_end.push_back(end[i]);
+        slice_strides.push_back(strides[i]);
       } else {
         // Negative stride: swap begin and end, add 1 because the interval
         // is semi-open, and mark the dimension to be reversed.
-        slice_begin.push_back(end[i] + 1);
-        slice_end.push_back(begin[i] + 1);
+        slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1);
+        slice_end.push_back(input_shape.dim_size(i) - end[i] - 1);
+        slice_strides.push_back(-strides[i]);
         dimensions_to_reverse.push_back(i);
       }
     }
-    xla::ComputationDataHandle slice =
-        ctx->builder()->Slice(ctx->Input(0), slice_begin, slice_end);
+
+    xla::ComputationDataHandle slice = ctx->Input(0);
     if (!dimensions_to_reverse.empty()) {
       slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
     }
 
-    // If at least one of the strides is > 1 (or < -1) then use Slice
-    // to pull out each of the strided slices, and Concat to put them
-    // together again.
-    if (!simple_strides) {
-      // Re-adjust the begin and end now that the periphery has been
-      // sliced away.
-      for (int d = 0; d < strides.size(); ++d) {
-        slice_end[d] -= slice_begin[d];
-        slice_begin[d] = 0;
-      }
-
-      for (int d = 0; d < strides.size(); ++d) {
-        int64 stride = std::abs(strides[d]);
-        if (stride > 1) {
-          std::vector<xla::ComputationDataHandle> to_concat;
-          int64 end = slice_end[d];
-          for (int64 i = 0; i < end; i += stride) {
-            slice_begin[d] = i;
-            slice_end[d] = i + 1;
-            to_concat.push_back(
-                ctx->builder()->Slice(slice, slice_begin, slice_end));
-          }
-          slice = ctx->builder()->ConcatInDim(to_concat, d);
-          slice_begin[d] = 0;
-          slice_end[d] = to_concat.size();
-        }
-      }
-    }
+    slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides);
 
     slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
     ctx->SetOutput(0, slice);
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 598b341002..9367c1ef22 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -318,7 +318,7 @@ class TensorArrayGatherOp : public XlaOpKernel {
     for (int i = 0; i < num_indices; ++i) {
       // Slices the i-th index out of `indices`, and pads it with zeros in the
       // minor dimensions to form an index into the TensorArray storage.
-      auto index = b->Slice(indices, {i}, {i + 1});
+      auto index = b->Slice(indices, {i}, {i + 1}, {1});
 
       // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
       auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1);
@@ -381,16 +381,18 @@ class TensorArrayScatterOp : public XlaOpKernel {
     std::vector<int64> value_starts(value_shape.dims(), 0);
     auto value_ends = value_shape.dim_sizes();
 
+    std::vector<int64> value_strides(value_shape.dims(), 1);
+
     // For every (index, value) pair, update the corresponding TensorArray
     // storage.
     for (int i = 0; i < num_indices; ++i) {
       // Slice out part of the value.
       value_starts[0] = i;
       value_ends[0] = i + 1;
-      auto slice = b->Slice(value, value_starts, value_ends);
+      auto slice = b->Slice(value, value_starts, value_ends, value_strides);
 
       // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-      auto index = b->Slice(indices, {i}, {i + 1});
+      auto index = b->Slice(indices, {i}, {i + 1}, {1});
       auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims());
       ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index a5ce78e520..f87586ba57 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -66,6 +66,7 @@ class UnpackOp : public XlaOpKernel {
 
     std::vector<int64> start_indices(input_shape.dims(), 0);
     std::vector<int64> limit_indices(input_shape.dims());
+    std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < input_shape.dims(); ++i) {
       limit_indices[i] = input_shape.dim_size(i);
     }
@@ -73,7 +74,8 @@ class UnpackOp : public XlaOpKernel {
     for (int i = 0; i < num; ++i) {
       start_indices[axis] = i;
       limit_indices[axis] = i + 1;
-      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices);
+      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices,
+                                         strides);
       // Reshape to drop the 'axis' dimension.
       auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes());
       ctx->SetOutput(i, result);
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 735a69d596..dcc313707b 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -256,7 +256,8 @@ void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
 ComputationDataHandle ComputationBuilder::Slice(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
-    tensorflow::gtl::ArraySlice<int64> limit_indices) {
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> stride) {
   if (!first_error_.ok() || !PrepareComputation().ok()) {
     return ComputationDataHandle();
   }
@@ -269,6 +270,9 @@ ComputationDataHandle ComputationBuilder::Slice(
   for (int64 index : limit_indices) {
     request.add_limit_indices(index);
   }
+  for (int64 index : stride) {
+    request.add_stride(index);
+  }
   OpRequest op_request;
   *op_request.mutable_computation() = computation_.handle();
   *op_request.mutable_slice_request() = request;
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 5dceb03281..b411346459 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -211,9 +211,11 @@ class ComputationBuilder {
   //
   // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
   // range notation.
+  // The stride parameter determines the stride over the slice
   ComputationDataHandle Slice(const ComputationDataHandle& operand,
                               tensorflow::gtl::ArraySlice<int64> start_indices,
-                              tensorflow::gtl::ArraySlice<int64> limit_indices);
+                              tensorflow::gtl::ArraySlice<int64> limit_indices,
+                              tensorflow::gtl::ArraySlice<int64> stride);
 
   // Enqueues a slice operation onto the computation that slices the 'operand'
   // from dynamic start indices which are passed in 'start_indices'.
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 72eec35fc4..ec643e037f 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -1215,11 +1215,7 @@ void Literal::Resize<double>(int64 num_elements, double value) {
 template <>
 void Literal::Resize<half>(int64 num_elements, half value) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_f16s()->resize(num_elements * sizeof(half));
-  auto data = GetMutableArraySlice<half>();
-  for (int i = 0; i < num_elements; i++) {
-    data[i] = value;
-  }
+  mutable_f16s()->resize(num_elements, value);
 }
 
 template <typename RepeatedFieldT, typename NativeT>
@@ -1262,7 +1258,7 @@ LiteralProto Literal::ToProto() const {
     case F16:
       *proto.mutable_f16s() =
           string(reinterpret_cast<const char*>(f16s_.data()),
-                 f16s_.size() / sizeof(half));
+                 f16s_.size() * sizeof(half));
       break;
     case F32:
       CopyToRepeatedField(proto.mutable_f32s(), f32s());
@@ -1318,7 +1314,7 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
       const string& s(literal_proto.f16s());
       CHECK_EQ(0, s.size() % sizeof(half));
       f16s_ = std::vector<half>(s.size() / sizeof(half));
-      memcpy(f16s_.data(), s.data(), s.size() / sizeof(half));
+      memcpy(f16s_.data(), s.data(), s.size());
       break;
     }
     case F32:
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 11f25fb847..5976e83950 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -949,5 +949,62 @@ TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   }
 }
 
+// Note that f16 is currently stored in a byte array in little endian byte order
+TEST_F(LiteralUtilTest, ToProto_f16) {
+  half h1(1.0f);
+  half h2(2.0f);
+
+  auto m = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
+  Literal* l = m.get();
+  EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape()));
+  EXPECT_EQ(4, l->f16s().size());
+  EXPECT_EQ(4, l->f16s_size());
+
+  LiteralProto p = l->ToProto();
+  EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
+  EXPECT_EQ(8, p.f16s().size());
+  const char* d = p.f16s().data();
+  EXPECT_EQ(d[0], 0);
+  EXPECT_EQ(d[1], 0x3C);
+  EXPECT_EQ(d[2], 0);
+  EXPECT_EQ(d[3], 0x40);
+  EXPECT_EQ(d[4], 0);
+  EXPECT_EQ(d[5], 0x40);
+  EXPECT_EQ(d[6], 0);
+  EXPECT_EQ(d[7], 0x3C);
+}
+
+// Note that f16 is currently stored in a byte array in little endian byte order
+TEST_F(LiteralUtilTest, CopyFromProto_f16) {
+  half h1(1.0f);
+  half h2(2.0f);
+
+  const char half_vals[8] = {
+    0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C
+  };
+  LiteralProto p;
+  p.mutable_shape()->set_element_type(F16);
+  p.mutable_shape()->clear_dimensions();
+  p.mutable_shape()->add_dimensions(4);
+  p.clear_f16s();
+  p.set_f16s(half_vals, 8);
+
+
+  Literal literal(p);
+  ASSERT_EQ(4, literal.f16s_size());
+  ASSERT_EQ(h1, literal.f16s(0));
+  ASSERT_EQ(h2, literal.f16s(1));
+  ASSERT_EQ(h2, literal.f16s(2));
+  ASSERT_EQ(h1, literal.f16s(3));
+
+  const std::vector<half>& r = literal.f16s();
+  ASSERT_EQ(4, r.size());
+  ASSERT_EQ(h1, r[0]);
+  ASSERT_EQ(h2, r[1]);
+  ASSERT_EQ(h2, r[2]);
+  ASSERT_EQ(h1, r[3]);
+}
+
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 0187c09d7b..5709ac3067 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -855,6 +855,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
     // Second, construct the slice instruction to perform the negative padding.
     std::vector<int64> start_indices;
     std::vector<int64> end_indices;
+    std::vector<int64> strides;
     for (int64 i = 0; i < pad->padding_config().dimensions_size(); ++i) {
       const PaddingConfig::PaddingConfigDimension& padding_dimension =
           pad->padding_config().dimensions(i);
@@ -868,16 +869,18 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
       }
       start_indices.push_back(start);
       end_indices.push_back(end);
+      strides.push_back(1);
     }
 
     // Verify that the slice shape matches the pad shape.
     TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape,
                         ShapeInference::InferSliceShape(
-                            nonzero_pad_shape, start_indices, end_indices));
+                            nonzero_pad_shape, start_indices, end_indices,
+                            strides));
     TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape()));
 
     std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
-        pad->shape(), nonzero_pad, start_indices, end_indices);
+        pad->shape(), nonzero_pad, start_indices, end_indices, strides);
     return ReplaceWithNewInstruction(pad, std::move(slice));
   }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 0792006ddb..7e52c8fb0c 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -520,7 +520,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
-          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}));
+          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1}));
   Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
@@ -551,7 +551,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
-          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}));
+          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1}));
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
@@ -1132,7 +1132,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
           0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
   builder.AddInstruction(HloInstruction::CreateSlice(
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
-      /*limit_indices=*/{dim0, dim1}));
+      /*limit_indices=*/{dim0, dim1}, /*slices=*/{1, 1}));
 
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
@@ -1537,7 +1537,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
 
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
-      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}));
+      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index c498b86dd4..56568fd446 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -731,7 +731,7 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
 
@@ -763,7 +763,7 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast}));
@@ -800,7 +800,7 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) {
   auto tuple_element = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(f32vec100_, tuple, 0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}, {1}));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast}));
@@ -835,7 +835,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   // Slice output is 10 elements.
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
@@ -867,7 +867,7 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 10}), slice, {0}));
@@ -904,7 +904,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   // Slice output is 10 elements.
   auto slice = builder.AddInstruction(
-      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
+      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
   // Broadcast output is 40 elements.
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index a31e9b1782..a5f7cc0aeb 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -588,7 +588,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     if (update_uses_tuple_element1) {
       // Create a slice instruction as an additional user of 'gte1'.
       slice = builder.AddInstruction(
-          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}));
+          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}, {1}));
       update = builder.AddInstruction(HloInstruction::CreateBinary(
           update_shape, HloOpcode::kAdd, update, slice));
     }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index dd00c58240..0a1911cbd1 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -55,7 +55,7 @@ class CompileOnlyService : public Service {
 
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
-  // computing contants produces global data that we may wish to transfer.
+  // computing constants produces global data that we may wish to transfer.
   tensorflow::Status Execute(const ExecuteRequest* arg,
                              ExecuteResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index cdf277581f..cdfa30dd9a 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -49,17 +49,18 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
   return Status::OK();
 }
 
-/* static */ StatusOr<DeviceAssignment> DeviceAssignment::Deserialize(
-    const DeviceAssignmentProto& proto) {
+/* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
+DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
   TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
-  DeviceAssignment assignment(proto.replica_count(), proto.computation_count());
+  auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
+                                                 proto.computation_count());
   for (int computation = 0; computation < proto.computation_count();
        ++computation) {
     const auto& computation_device = proto.computation_devices(computation);
     TF_RET_CHECK(computation_device.replica_device_ids_size() ==
                  proto.replica_count());
     for (int replica = 0; replica < proto.replica_count(); ++replica) {
-      assignment(replica, computation) =
+      (*assignment)(replica, computation) =
           computation_device.replica_device_ids(replica);
     }
   }
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 4d26d6bb85..7d9abcd100 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -49,7 +49,11 @@ class DeviceAssignment : public Array2D<int> {
 
   // Protocol buffer serialization and deserialization.
   Status Serialize(DeviceAssignmentProto* proto) const;
-  static StatusOr<DeviceAssignment> Deserialize(
+
+  // Return a std::unique_ptr<DeviceAssignment> instead of a DeviceAssignment
+  // directly because one of the supported TF platforms (mac) does not compile
+  // due to a StatusOr of an incomplete type (DeviceAssignment).
+  static StatusOr<std::unique_ptr<DeviceAssignment>> Deserialize(
       const DeviceAssignmentProto& proto);
 };
 
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 5b21ae3d2a..db0a8b36cd 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -949,9 +949,20 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         IrArray::Index sliced_index(index.size());
         for (int i = 0; i < index.size(); ++i) {
-          sliced_index[i] = ir_builder_->CreateAdd(
-              index[i], llvm::ConstantInt::get(index[i]->getType(),
-                                               hlo->slice_starts(i)));
+          int64 stride = hlo->slice_stride(i);
+          if (stride != 1) {
+            sliced_index[i] = ir_builder_->CreateAdd(
+                ir_builder_->CreateMul(
+                    index[i], llvm::ConstantInt::get(index[i]->getType(),
+                                                     stride)),
+                llvm::ConstantInt::get(index[i]->getType(),
+                                       hlo->slice_starts(i)));
+          } else {
+            sliced_index[i] = ir_builder_->CreateAdd(
+                    index[i],
+                    llvm::ConstantInt::get(index[i]->getType(),
+                                           hlo->slice_starts(i)));
+          }
         }
         return operand_to_generator.at(hlo->operand(0))(sliced_index);
       };
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 4e130de311..b8c6162084 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -80,6 +80,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     std::vector<int64> start_indices(input->shape().dimensions_size(), 0);
     std::vector<int64> limit_indices(input->shape().dimensions().begin(),
                                      input->shape().dimensions().end());
+    std::vector<int64> strides(input->shape().dimensions_size(), 1);
     for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
       int64 dim = conv_dnums.spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
@@ -92,9 +93,9 @@ HloInstruction* MaybePaddedAndSlicedInput(
 
     input = computation->AddInstruction(HloInstruction::CreateSlice(
         ShapeInference::InferSliceShape(input->shape(), start_indices,
-                                        limit_indices)
+                                        limit_indices, strides)
             .ConsumeValueOrDie(),
-        input, start_indices, limit_indices));
+        input, start_indices, limit_indices, strides));
   }
 
   return input;
@@ -354,6 +355,8 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   std::vector<int64> limit_indices(
       new_backward_conv->shape().dimensions().begin(),
       new_backward_conv->shape().dimensions().end());
+  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(),
+                             1LL);
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
@@ -373,13 +376,13 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   // Replace the old backward convolution with the slice.
   CHECK(ShapeUtil::Compatible(
       ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
-                                      limit_indices)
+                                      limit_indices, strides)
           .ConsumeValueOrDie(),
       backward_conv->shape()));
   TF_CHECK_OK(computation->ReplaceWithNewInstruction(
       backward_conv,
       HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv,
-                                  start_indices, limit_indices)));
+                                  start_indices, limit_indices, strides)));
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index a643bc4076..1c60b06ddd 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -147,6 +147,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   const int64 slice_start[] = {4, 2, 3, 1, 5};
   const int64 slice_limits[] = {10, 8, 6, 5, 9};
+  const int64 slice_strides[] = {1, 1, 1, 1, 1};
   TF_ASSIGN_OR_ASSERT_OK(auto literal,
                          LiteralTestUtil::CreateRandomLiteral<F32>(
                              ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
@@ -154,7 +155,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
       HloInstruction::CreateConstant(std::move(literal)));
   Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
   builder.AddInstruction(HloInstruction::CreateSlice(
-      shape, literal_instruction, slice_start, slice_limits));
+      shape, literal_instruction, slice_start, slice_limits, slice_strides));
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 7c60fc79ed..4b38ecd5de 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -306,11 +306,13 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
-    tensorflow::gtl::ArraySlice<int64> limit_indices) {
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> strides) {
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape));
   instruction->AppendOperand(operand);
   instruction->slice_starts_.assign(start_indices.begin(), start_indices.end());
   instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end());
+  instruction->slice_strides_.assign(strides.begin(), strides.end());
   return instruction;
 }
 
@@ -852,7 +854,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       return CreateReshape(shape, new_operands[0]);
     case HloOpcode::kSlice:
       CHECK_EQ(new_operands.size(), 1);
-      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_);
+      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_,
+                         slice_strides_);
     case HloOpcode::kDynamicSlice:
       return CreateDynamicSlice(shape, new_operands[0], new_operands[1],
                                 dynamic_slice_sizes_);
@@ -1672,6 +1675,8 @@ string HloInstruction::ToCategory() const {
       case FusionKind::kConvBackwardFilter:
       case FusionKind::kConvBackwardInput:
         return "convolution fusion";
+      case FusionKind::kCustom:
+        return "custom fusion";
     }
   }
 
@@ -2339,6 +2344,8 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kConvBackwardFilter";
     case HloInstruction::FusionKind::kConvBackwardInput:
       return "kConvBackwardInput";
+    case HloInstruction::FusionKind::kCustom:
+      return "kCustom";
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5885261e8f..bf50542cc0 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -63,6 +63,9 @@ class HloInstruction {
     kTransposeDot,        // Fused into a dot with transposed operands.
     kConvBackwardFilter,  // Fused into a backward filter convolution.
     kConvBackwardInput,   // Fused into a backward input convolution.
+
+    kCustom,              // Custom category for backend-specific fusions that
+                          // do not match any of the more specific ones.
   };
 
   ~HloInstruction();
@@ -174,7 +177,8 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateSlice(
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> start_indices,
-      tensorflow::gtl::ArraySlice<int64> limit_indices);
+      tensorflow::gtl::ArraySlice<int64> limit_indices,
+      tensorflow::gtl::ArraySlice<int64> strides);
 
   // Creates a slice instruction, where the first operand is sliced by
   // start indices specified in the second operand, and by size specfied in
@@ -662,6 +666,15 @@ class HloInstruction {
     return slice_limits_;
   }
 
+  // Returns the stride in the given dimension for a slice node.
+  //
+  // Precondition: opcode() == HloOpcode::kSlice
+  int64 slice_stride(int64 dimension) const {
+    CHECK_EQ(HloOpcode::kSlice, opcode_);
+    return slice_strides_[dimension];
+  }
+  const std::vector<int64>& slice_strides() const { return slice_strides_; }
+
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
   //
@@ -907,6 +920,7 @@ class HloInstruction {
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
+  std::vector<int64> slice_strides_;
 
   // The bit sizes for a reduce-precision operation.
   int32 exponent_bits_;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 8a1e705711..1a861cd16b 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -67,7 +67,8 @@ class HloRematerializationTest : public HloTestBase {
         /*dimension=*/0));
     auto slice_1 = builder.AddInstruction(HloInstruction::CreateSlice(
         vec1_shape_, concat_1, /*start_indices=*/{0},
-        /*limit_indices=*/{1}));
+        /*limit_indices=*/{1},
+        /*strides=*/{1}));
     auto concat_2 = builder.AddInstruction(HloInstruction::CreateConcatenate(
         ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, slice_1},
         /*dimension=*/0));
@@ -75,7 +76,8 @@ class HloRematerializationTest : public HloTestBase {
     // which is necessary to use this computation in a while.
     builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat_2,
                                                        /*start_indices=*/{0},
-                                                       /*limit_indices=*/{1}));
+                                                       /*limit_indices=*/{1},
+                                                       /*strides=*/{1}));
     return builder.Build();
   }
 
@@ -103,7 +105,8 @@ class HloRematerializationTest : public HloTestBase {
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto slice_1 = builder.AddInstruction(
         HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0},
-                                    /*limit_indices=*/{1}));
+                                    /*limit_indices=*/{1},
+                                    /*strides=*/{1}));
     auto while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
         vec1_shape_, while_cond, while_body, slice_1));
     auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
@@ -111,7 +114,8 @@ class HloRematerializationTest : public HloTestBase {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat,
                                                        /*start_indices=*/{0},
-                                                       /*limit_indices=*/{1}));
+                                                       /*limit_indices=*/{1},
+                                                       /*strides=*/{1}));
     return builder.Build();
   }
 
@@ -353,7 +357,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}));
+        /*limit_indices=*/{1024}, /*slices=*/{1}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
@@ -469,7 +473,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}));
+        /*limit_indices=*/{1024}, /*slices=*/{1}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index b332709995..5e4df9ddd6 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1135,7 +1135,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
 /* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
-    tensorflow::gtl::ArraySlice<int64> limits) {
+    tensorflow::gtl::ArraySlice<int64> limits,
+    tensorflow::gtl::ArraySlice<int64> strides) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
@@ -1158,13 +1159,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dimension = 0; dimension < starts.size(); ++dimension) {
     int64 start_index = starts[dimension];
     int64 limit_index = limits[dimension];
+    int64 stride = strides[dimension];
     if (start_index < 0) {
       return InvalidArgument("negative start index to slice: %lld",
                              start_index);
     }
-    if (limit_index < 0) {
-      return InvalidArgument("negative limit index to slice: %lld",
-                             limit_index);
+    if (stride == 0) {
+      return InvalidArgument("Zero stride");
     }
     if (limit_index > arg.dimensions(dimension)) {
       return InvalidArgument(
@@ -1172,18 +1173,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           "size (%lld)",
           limit_index, arg.dimensions(dimension));
     }
-    if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice",
-          limit_index, start_index);
-    }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
-
-    sizes.push_back(limits[dimension] - starts[dimension]);
+    if (stride > 0) {
+      if (start_index > limit_index) {
+        return InvalidArgument(
+            "limit index (%lld) must be greater or equal to "
+            "start index (%lld) in slice with positive stride",
+            limit_index, start_index);
+      }
+      sizes.push_back((limit_index - start_index + stride - 1) / stride);
+    } else {
+      return InvalidArgument("Negative strides not supported");
+    }
   }
 
   return ShapeUtil::MakeShape(arg.element_type(), sizes);
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 55c60e149d..42e4c7d39d 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -116,7 +116,8 @@ class ShapeInference {
   // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16]
   static StatusOr<Shape> InferSliceShape(
       const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
-      tensorflow::gtl::ArraySlice<int64> limits);
+      tensorflow::gtl::ArraySlice<int64> limits,
+      tensorflow::gtl::ArraySlice<int64> strides);
 
   // Infers the shape produced by a dynamic slice operation of size specified
   // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 7cff042a48..8c731ae297 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -682,16 +682,43 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2) {
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
   auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64});
+      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1});
   ASSERT_IS_OK(inferred_status.status());
   Shape inferred = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), inferred));
 }
 
+TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) {
+  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  auto inferred_status =
+      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4});
+  ASSERT_IS_OK(inferred_status.status());
+  Shape inferred = inferred_status.ValueOrDie();
+  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), inferred));
+}
+
+TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) {
+  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  auto inferred_status =
+      ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4});
+  ASSERT_IS_OK(inferred_status.status());
+  Shape inferred = inferred_status.ValueOrDie();
+  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), inferred));
+}
+
+TEST_F(ShapeInferenceTest, InferInvalidStride) {
+  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  auto inferred_status =
+      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1});
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
+            inferred_status.status().code());
+}
+
 TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
   auto inferred_status =
-      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2});
+      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1});
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
             inferred_status.status().code());
@@ -700,7 +727,7 @@ TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
 TEST_F(ShapeInferenceTest, InferSliceShapeRank1) {
   Shape vector_shape = ShapeUtil::MakeShape(F32, {17});
   auto inferred_status =
-      ShapeInference::InferSliceShape(vector_shape, {2}, {4});
+      ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1});
   ASSERT_TRUE(inferred_status.ok());
   Shape inferred = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(inferred, ShapeUtil::MakeShape(F32, {2})));
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index d25e5adee3..cd79e63caf 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -584,7 +584,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
     if (add_additional_gte0_user) {
       // Create 'slice' as an additional user of 'input'.
       auto slice = builder.AddInstruction(
-          HloInstruction::CreateSlice(update_shape, input, {0}, {3}));
+          HloInstruction::CreateSlice(update_shape, input, {0}, {3}, {1}));
       // Modify 'update' to take 'slice' output.
       update = builder.AddInstruction(HloInstruction::CreateBinary(
           update_shape, HloOpcode::kAdd, update, slice));
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 1f6e789379..92b8c7bb21 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -744,7 +744,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddSliceInstruction(
       Shape new_shape,
       ShapeInference::InferSliceShape(
           operand->output_shape(), AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices())));
+          AsInt64Slice(slice_request.limit_indices()),
+          AsInt64Slice(slice_request.stride())));
 
   ComputationDataHandle handle = CreateComputationDataHandle();
 
@@ -2393,7 +2394,8 @@ void ComputationLowerer::Visit(
       hlo_instruction = add_instruction(HloInstruction::CreateSlice(
           request.output_shape(), operand,
           AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices())));
+          AsInt64Slice(slice_request.limit_indices()),
+          AsInt64Slice(slice_request.stride())));
       break;
     }
 
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index bb7fbad000..024988743c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1853,7 +1853,7 @@ TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
 
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, y_literal->shape(), "y");
-  auto slice = builder.Slice(x, {1}, {2});
+  auto slice = builder.Slice(x, {1}, {2}, {1});
   builder.Sub(slice, y);
 
   ComputeAndCompareR1<float>(&builder, {-2, -3}, {x_data.get(), y_data.get()},
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 7abef6a27b..63a630f9e5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -365,9 +365,9 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
   std::vector<xla::ComputationDataHandle> out_slices;
   for (int i = 0; i < 4; ++i) {
     // Slice off individual matrices and reshape to 2D tensors.
-    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2});
+    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
     x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2});
-    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2});
+    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
     y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2});
 
     auto out = builder.Dot(x_slice, y_slice);
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index c8b91eafc7..7803d234fd 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -210,7 +210,7 @@ XLA_TEST_F(FusionTest, Test) {
       HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}),
                                     HloOpcode::kSelect, const10, add8, const9));
   auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice(
-      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}));
+      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1}));
   // CreateFusionInstruction needs the `instructions_to_fuse` argument in
   // reverse topological order, so the first element in `instructions_to_fuse`
   // must be the root.
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index df3d4fa21d..56c15e5ff7 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -36,7 +36,7 @@ XLA_TEST_F(SliceTest, Slice2D) {
   ComputationBuilder builder(client_, "slice_2d");
   auto original = builder.ConstantR2<float>(
       {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}});
-  builder.Slice(original, {2, 1}, {4, 3});
+  builder.Slice(original, {2, 1}, {4, 3}, {1, 1});
 
   Array2D<float> expected({{8.0f, 9.0f}, {11.0f, 12.0f}});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -47,7 +47,7 @@ XLA_TEST_F(SliceTest, Slice3D) {
   Array3D<float> array_3d(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}});
   auto original = builder.ConstantR3FromArray3D<float>(array_3d);
-  builder.Slice(original, {0, 0, 1}, {2, 1, 2});
+  builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1});
 
   Array3D<float> expected_3d({{{2.0f}}, {{6.0f}}});
   ComputeAndCompareR3<float>(&builder, expected_3d, {}, ErrorSpec(0.000001));
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index 2065e9e813..a7692fceb4 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -325,7 +325,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
   ComputationBuilder builder(client_, TestName());
   auto input = builder.Parameter(0, original, "input");
   // Use the slice operator to get an off-diagonal element.
-  builder.Slice(input, {0, 1}, {1, 2});
+  builder.Slice(input, {0, 1}, {1, 2}, {1, 1});
 
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 97120df0c5..5e7d475662 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -44,7 +44,7 @@ class SliceTest : public ClientLibraryTestBase {
 
     ComputationBuilder builder(client_, TestName());
     auto original = builder.ConstantR1<NativeT>(constant);
-    builder.Slice(original, {2}, {4});
+    builder.Slice(original, {2}, {4}, {1});
 
     const std::vector<NativeT> expected = {static_cast<NativeT>(2),
                                            static_cast<NativeT>(3)};
@@ -55,7 +55,7 @@ class SliceTest : public ClientLibraryTestBase {
 XLA_TEST_F(SliceTest, SliceZeroToZeroF32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>({});
-  builder.Slice(original, {0}, {0});
+  builder.Slice(original, {0}, {0}, {1});
 
   ComputeAndCompareR1<float>(&builder, {}, {});
 }
@@ -64,7 +64,7 @@ XLA_TEST_F(SliceTest, SliceTenToZeroF32) {
   ComputationBuilder builder(client_, TestName());
   std::vector<float> constant(10, 0.3);
   auto original = builder.ConstantR1<float>(constant);
-  builder.Slice(original, {7}, {7});
+  builder.Slice(original, {7}, {7}, {1});
 
   ComputeAndCompareR1<float>(&builder, {}, {});
 }
@@ -87,7 +87,7 @@ TEST_F(SliceTest, SliceTenToTen) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {0}, {10});
+  builder.Slice(original, {0}, {10}, {1});
 
   ComputeAndCompareR1<float>(&builder, values, {}, ErrorSpec(0.000001));
 }
@@ -98,7 +98,7 @@ TEST_F(SliceTest, SliceLastFourOf1024) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {1024 - 4}, {1024});
+  builder.Slice(original, {1024 - 4}, {1024}, {1});
 
   const std::vector<float> expected = {1020, 1021, 1022, 1023};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -112,7 +112,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {7}, {7 + 1024});
+  builder.Slice(original, {7}, {7 + 1024}, {1});
 
   std::vector<float> expected(1024);
   std::iota(values.begin(), values.end(), 7.0);
@@ -122,7 +122,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
-  builder.Slice(original, {0, 0}, {0, 0});
+  builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {});
 }
@@ -130,7 +130,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
-  builder.Slice(original, {0, 15}, {0, 20});
+  builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 5), {});
 }
@@ -138,7 +138,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
-  builder.Slice(original, {1, 0}, {3, 0});
+  builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {});
 }
@@ -153,7 +153,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {128, 128}, {256, 256});
+  builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
 
   Array2D<float> expected(128, 128);
   for (int row = 0; row < 128; ++row) {
@@ -171,7 +171,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
 
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {0, 3072}, {1, 4096});
+  builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
 
   Array2D<float> expected(1, 1024);
   std::iota(expected.data(), expected.data() + 1024, 3072.0);
@@ -192,7 +192,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
   }
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
-  builder.Slice(original, {0, 0}, {16, 2});
+  builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
 }
 
@@ -204,7 +204,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
       ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}});
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR4FromArray4D(values);
-  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128});
+  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
 }
 
@@ -213,6 +213,7 @@ struct R2Spec {
   int64 input_dim1;
   std::array<int64, 2> slice_starts;
   std::array<int64, 2> slice_limits;
+  std::array<int64, 2> slice_strides;
   Layout layout;
 };
 
@@ -228,7 +229,7 @@ TEST_P(SliceR2Test, DoIt) {
 
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<int32>(input);
-  builder.Slice(a, spec.slice_starts, spec.slice_limits);
+  builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
 
   std::unique_ptr<Array2D<int32>> expected =
       ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits);
@@ -239,19 +240,23 @@ TEST_P(SliceR2Test, DoIt) {
 INSTANTIATE_TEST_CASE_P(
     SliceR2TestInstantiation, SliceR2Test,
     ::testing::Values(
-        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({0, 1})},
-        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({0, 1})},
-        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {256, 400, {{0, 300}}, {{256, 400}},
+        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
+          LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {500, 400, {{111, 123}}, {{300, 257}},
+        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
+          LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {500, 400, {{111, 123}}, {{300, 400}},
+        R2Spec {256, 400, {{0, 300}}, {{256, 400}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {384, 512, {{128, 256}}, {{256, 384}},
+        R2Spec {500, 400, {{111, 123}}, {{300, 257}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})},
-        R2Spec {357, 512, {{111, 256}}, {{301, 384}},
+        R2Spec {500, 400, {{111, 123}}, {{300, 400}}, {{1, 1}},
+          LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}},
+          LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})}
     )
 );
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index ccd2a95658..afa7d871c0 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -666,7 +666,8 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   auto build_condition = [this, v6s32](int count) {
     ComputationBuilder builder(client_, TestName());
     auto prev = builder.Reshape(
-        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}), {0}, {});
+        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
+          {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 42d5c1d155..31f0c3147e 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -195,16 +195,24 @@ bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
 // 2. permutation.size() == input.size().
 template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
-                       C<T> input_) {
-  tensorflow::gtl::ArraySlice<T> input(input_);
-  CHECK(IsPermutation(permutation, input.size()));
-  std::vector<T> output(input.size());
+                       C<T> input) {
+  tensorflow::gtl::ArraySlice<T> data(input);
+  CHECK(IsPermutation(permutation, data.size()));
+  std::vector<T> output(data.size());
   for (size_t i = 0; i < permutation.size(); ++i) {
-    output[permutation[i]] = input[i];
+    output[permutation[i]] = data[i];
   }
   return output;
 }
 
+// Override of the above that works around compile failures with gcc 7.1.1.
+// For details see https://github.com/tensorflow/tensorflow/issues/10843
+template <typename T>
+std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
+                       const std::vector<T>& input) {
+  return Permute<std::vector, T>(permutation, input);
+}
+
 // Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
 std::vector<int64> InversePermutation(
     tensorflow::gtl::ArraySlice<int64> input_permutation);
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 95c1f0995b..86c72b3449 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -200,7 +200,7 @@ message OpMetadata {
   string op_name = 2;
   // Indicate a file and line that this op is associated to in a user's program.
   //
-  // e.g. it could be be the file and line of user code that generated the op.
+  // e.g. it could be the file and line of user code that generated the op.
   string source_file = 3;
   int32 source_line = 4;
 }
@@ -369,6 +369,7 @@ message SliceRequest {
   ComputationDataHandle operand = 2;
   repeated int64 start_indices = 3;
   repeated int64 limit_indices = 4;
+  repeated int64 stride = 5;
 }
 
 message DynamicSliceRequest {
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index 1f180429b2..b1d18d2faf 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -17,6 +17,7 @@ package org.tensorflow.contrib.android;
 
 import android.content.res.AssetManager;
 import android.os.Trace;
+import android.os.Build.VERSION;
 import android.text.TextUtils;
 import android.util.Log;
 import java.io.FileInputStream;
@@ -370,9 +371,11 @@ public class TensorFlowInferenceInterface {
   private void loadGraph(InputStream is, Graph g) throws IOException {
     final long startMs = System.currentTimeMillis();
 
-    Trace.beginSection("initializeTensorFlow");
+    if (VERSION.SDK_INT >= 18) {
+      Trace.beginSection("initializeTensorFlow");
+      Trace.beginSection("readGraphDef");
+    }
 
-    Trace.beginSection("readGraphDef");
     // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
     byte[] graphDef = new byte[is.available()];
     final int numBytesRead = is.read(graphDef);
@@ -383,17 +386,22 @@ public class TensorFlowInferenceInterface {
               + " of the graph, expected to read "
               + graphDef.length);
     }
-    Trace.endSection();
 
-    Trace.beginSection("importGraphDef");
+    if (VERSION.SDK_INT >= 18) {
+      Trace.endSection(); // readGraphDef.
+      Trace.beginSection("importGraphDef");
+    }
+
     try {
       g.importGraphDef(graphDef);
     } catch (IllegalArgumentException e) {
       throw new IOException("Not a valid TensorFlow Graph serialization: " + e.getMessage());
     }
-    Trace.endSection();
 
-    Trace.endSection(); // initializeTensorFlow.
+    if (VERSION.SDK_INT >= 18) {
+      Trace.endSection(); // importGraphDef.
+      Trace.endSection(); // initializeTensorFlow.
+    }
 
     final long endMs = System.currentTimeMillis();
     Log.i(
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
index 093000559b..b0f9237ea2 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
@@ -60,7 +60,7 @@ class BigQueryReader : public ReaderBase {
     BigQueryTablePartition partition;
     if (!partition.ParseFromString(current_work())) {
       return errors::InvalidArgument(
-          "Could not parse work as as valid partition.");
+          "Could not parse work as valid partition.");
     }
     TF_RETURN_IF_ERROR(bigquery_table_accessor_->SetPartition(partition));
     return Status::OK();
diff --git a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
index 136707da18..cc8644bfd5 100644
--- a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
@@ -92,7 +92,7 @@ class BigQueryReader(io_ops.ReaderBase):
 
     Raises:
       TypeError: - If features is neither None nor a dict or
-                 - If columns is is neither None nor a list or
+                 - If columns is neither None nor a list or
                  - If both features and columns are None or set.
     """
     if (features is None) == (columns is None):
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 9ffe08eded..205b44ac12 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -74,7 +74,7 @@ if(WIN32)
   set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
-  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob2")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
diff --git a/tensorflow/contrib/data/README.md b/tensorflow/contrib/data/README.md
index 9505f5c465..6d7d278309 100644
--- a/tensorflow/contrib/data/README.md
+++ b/tensorflow/contrib/data/README.md
@@ -543,7 +543,7 @@ padded.
 ```python
 dataset = tf.contrib.data.Dataset.range(100)
 dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
-dataset = dataset.padded_batch(4, padded_shapes=[None])
+batched_dataset = dataset.padded_batch(4, padded_shapes=[None])
 
 iterator = batched_dataset.make_one_shot_iterator()
 next_element = iterator.get_next()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index d44e258bd2..42865ed404 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -120,7 +120,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):
 
     Doesn't actually do the sqrt! Named as such to agree with API.
 
-    To compute (M + V D V.T), we use the the Woodbury matrix identity:
+    To compute (M + V D V.T), we use the Woodbury matrix identity:
       inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
     where,
       C = inv(D) + V.T inv(M) V.
@@ -166,7 +166,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):
   def _woodbury_sandwiched_term(self):
     """Computes the sandwiched term in the Woodbury identity.
 
-    Computes the "`C`" in the the identity:
+    Computes the "`C`" in the identity:
        inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
     where,
        C = inv(D) + V.T inv(M) V.
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index 5b57a95c55..b525809015 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -52,7 +52,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
   the RelaxedBernoulli can suffer from underflow issues. In many case loss
   functions such as these are invariant under invertible transformations of
   the random variables. The KL divergence, found in the variational autoencoder
-  loss, is an example. Because RelaxedBernoullis are sampled by by a Logistic
+  loss, is an example. Because RelaxedBernoullis are sampled by a Logistic
   random variable followed by a `tf.sigmoid` op, one solution is to treat
   the Logistic as the random variable and `tf.sigmoid` as downstream. The
   KL divergences of two Logistics, which are always followed by a `tf.sigmoid`
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 26cf922d0a..2a4b92c729 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -47,7 +47,7 @@ def percentile(x,
   """Compute the `q`-th percentile of `x`.
 
   Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
-  way from the minimum to the maximum in in a sorted copy of `x`.
+  way from the minimum to the maximum in a sorted copy of `x`.
 
   The values and distances of the two nearest neighbors as well as the
   `interpolation` parameter will determine the percentile if the normalized
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index fd2c46d94d..fdee57695e 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Vectorized Laplace distribution class, directly using LinearOpeartor."""
+"""Vectorized Laplace distribution class, directly using LinearOperator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 762bc44814..2234400fdc 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -446,7 +446,7 @@ class Transformer(object):
       # TODO(fkp): return a subgraph?
       op_, op_outputs_ = self.transform_op_handler(info, op)
       if op is op_:
-        raise ValueError("In-place tranformation not allowed.")
+        raise ValueError("In-place transformation not allowed.")
 
       # Process op.
       info.transformed_ops[op] = op_
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 9f02fc0958..324f510301 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -3261,7 +3261,7 @@ def conv2d(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/ouputs.
+          for inputs/kernels/outputs.
       dilation_rate: tuple of 2 integers.
 
   Returns:
@@ -3309,7 +3309,7 @@ def conv2d_transpose(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/ouputs.
+          for inputs/kernels/outputs.
 
   Returns:
       A tensor, result of transposed 2D convolution.
@@ -3395,7 +3395,7 @@ def conv3d(x,
       padding: string, `"same"` or `"valid"`.
       data_format: `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow data format
-          for inputs/kernels/ouputs.
+          for inputs/kernels/outputs.
       dilation_rate: tuple of 3 integers.
 
   Returns:
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index d287fa56d9..34548c83c5 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -107,7 +107,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
     self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(**kwargs)
+    super(Dropout, self).__init__(rate=rate, noise_shape=noise_shape, seed=seed, **kwargs)
 
   def call(self, inputs, training=None):
     if training is None:
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index 5e8c23ed3e..cdef55f599 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -985,7 +985,7 @@ class LSTM(Recurrent):
 
   References:
       - [Long short-term
-        memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
+        memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
         (original 1997 paper)
       - [Supervised sequence labeling with recurrent neural
         networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
diff --git a/tensorflow/contrib/keras/python/keras/models_test.py b/tensorflow/contrib/keras/python/keras/models_test.py
index 50aba43c24..99fd6e1cbe 100644
--- a/tensorflow/contrib/keras/python/keras/models_test.py
+++ b/tensorflow/contrib/keras/python/keras/models_test.py
@@ -105,7 +105,7 @@ class TestModelSaving(test.TestCase):
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
-  def test_fuctional_model_saving(self):
+  def test_functional_model_saving(self):
     if h5py is None:
       return  # Skip test if models cannot be saved.
 
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index aa301a0ade..c96e42de45 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -121,7 +121,6 @@ cuda_py_test(
         ":layers_py",
         "//third_party/py/numpy",
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -141,6 +140,7 @@ cuda_py_test(
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/ops/losses:losses",
     ],
 )
 
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index ef05dbaa65..528de31346 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -17,12 +17,16 @@
 See the @{$python/contrib.layers} guide.
 
 @@avg_pool2d
+@@avg_pool3d
 @@batch_norm
 @@convolution2d
+@@convolution3d
 @@conv2d_in_plane
 @@convolution2d_in_plane
 @@conv2d_transpose
 @@convolution2d_transpose
+@@conv3d_transpose
+@@convolution3d_transpose
 @@dropout
 @@elu
 @@embedding_lookup_unique
@@ -31,6 +35,7 @@ See the @{$python/contrib.layers} guide.
 @@layer_norm
 @@linear
 @@max_pool2d
+@@max_pool3d
 @@one_hot_encoding
 @@relu
 @@relu6
@@ -101,6 +106,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['bias_add',
                     'conv2d',
+                    'conv3d',
                     'elu',
                     'feature_column',
                     'legacy_fully_connected',
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index f2a904b521..ed4b723ca7 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -49,15 +49,20 @@ from tensorflow.python.training import moving_averages
 # TODO(b/28426988): Replace legacy_* fns migrated from slim.
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
 __all__ = ['avg_pool2d',
+           'avg_pool3d',
            'batch_norm',
            'bias_add',
            'conv2d',
+           'conv3d',
            'conv2d_in_plane',
            'conv2d_transpose',
+           'conv3d_transpose',
            'convolution',
            'convolution2d',
            'convolution2d_in_plane',
            'convolution2d_transpose',
+           'convolution3d',
+           'convolution3d_transpose',
            'dropout',
            'elu',
            'flatten',
@@ -66,6 +71,7 @@ __all__ = ['avg_pool2d',
            'linear',
            'pool',
            'max_pool2d',
+           'max_pool3d',
            'one_hot_encoding',
            'relu',
            'relu6',
@@ -82,6 +88,8 @@ __all__ = ['avg_pool2d',
 
 DATA_FORMAT_NCHW = 'NCHW'
 DATA_FORMAT_NHWC = 'NHWC'
+DATA_FORMAT_NCDHW = 'NCDHW'
+DATA_FORMAT_NDHWC = 'NDHWC'
 
 
 @add_arg_scope
@@ -132,6 +140,54 @@ def avg_pool2d(inputs,
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
+@add_arg_scope
+def avg_pool3d(inputs,
+               kernel_size,
+               stride=2,
+               padding='VALID',
+               data_format=DATA_FORMAT_NDHWC,
+               outputs_collections=None,
+               scope=None):
+  """Adds a 3D average pooling op.
+
+  It is assumed that the pooling is done per image but not in batch or channels.
+
+  Args:
+    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
+      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
+      `data_format` is `NCDHW`.
+    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
+      pooling kernel over which the op is computed. Can be an int if both
+      values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same. Note that presently
+      both strides must have the same value.
+    padding: The padding method, either 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+
+  Returns:
+    A `Tensor` representing the results of the pooling operation.
+
+  Raises:
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+  """
+  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+    raise ValueError('data_format has to be either NCDHW or NDHWC.')
+  with ops.name_scope(scope, 'AvgPool3D', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = pooling_layers.AveragePooling3D(pool_size=kernel_size,
+                                            strides=stride,
+                                            padding=padding,
+                                            data_format=df,
+                                            _scope=sc)
+    outputs = layer.apply(inputs)
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 def _fused_batch_norm(
     inputs,
     decay=0.999,
@@ -985,6 +1041,7 @@ def convolution(inputs,
                                        sc.original_name_scope, outputs)
 
 convolution2d = convolution
+convolution3d = convolution
 
 
 @add_arg_scope
@@ -1203,6 +1260,116 @@ def convolution2d_transpose(
                                        sc.original_name_scope, outputs)
 
 
+@add_arg_scope
+def convolution3d_transpose(
+    inputs,
+    num_outputs,
+    kernel_size,
+    stride=1,
+    padding='SAME',
+    data_format=DATA_FORMAT_NDHWC,
+    activation_fn=nn.relu,
+    normalizer_fn=None,
+    normalizer_params=None,
+    weights_initializer=initializers.xavier_initializer(),
+    weights_regularizer=None,
+    biases_initializer=init_ops.zeros_initializer(),
+    biases_regularizer=None,
+    reuse=None,
+    variables_collections=None,
+    outputs_collections=None,
+    trainable=True,
+    scope=None):
+  """Adds a convolution3d_transpose with an optional batch normalization layer.
+  
+  The function creates a variable called `weights`, representing the
+  kernel, that is convolved with the input. If `batch_norm_params` is `None`, a
+  second variable called 'biases' is added to the result of the operation.
+  Args:
+    inputs: A 5-D `Tensor` of type `float` and shape
+      `[batch, depth, height, width, in_channels]` for `NDHWC` data format or
+      `[batch, in_channels, depth, height, width]` for `NCDHW` data format.
+    num_outputs: Integer, the number of output filters.
+    kernel_size: A list of length 3 holding the [kernel_depth, kernel_height, kernel_width] of
+      of the filters. Can be an int if both values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same.  Note that presently
+      both strides must have the same value.
+    padding: One of 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    activation_fn: Activation function. The default value is a ReLU function.
+      Explicitly set it to None to skip it and maintain a linear activation.
+    normalizer_fn: Normalization function to use instead of `biases`. If
+      `normalizer_fn` is provided then `biases_initializer` and
+      `biases_regularizer` are ignored and `biases` are not created nor added.
+      default set to None for no normalizer function
+    normalizer_params: Normalization function parameters.
+    weights_initializer: An initializer for the weights.
+    weights_regularizer: Optional regularizer for the weights.
+    biases_initializer: An initializer for the biases. If None skip biases.
+    biases_regularizer: Optional regularizer for the biases.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional list of collections for all the variables or
+      a dictionary containing a different list of collection per variable.
+    outputs_collections: Collection to add the outputs.
+    trainable: Whether or not the variables should be trainable or not.
+    scope: Optional scope for variable_scope.
+  Returns:
+    A tensor representing the output of the operation.
+  Raises:
+    ValueError: If 'kernel_size' is not a list of length 3.
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+    ValueError: If `C` dimension of `inputs` is None.
+  """
+  layer_variable_getter = _build_variable_getter(
+      {'bias': 'biases', 'kernel': 'weights'})
+
+  with variable_scope.variable_scope(
+      scope, 'Conv3d_transpose', [inputs], reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
+    if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+      raise ValueError('data_format has to be either NCDHW or NDHWC.')
+
+    inputs = ops.convert_to_tensor(inputs)
+
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = convolutional_layers.Convolution3DTranspose(
+        filters=num_outputs,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
+    outputs = layer.apply(inputs)
+
+    # Add variables to collections.
+    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
+    if layer.bias:
+      _add_variable_to_collections(layer.bias, variables_collections, 'biases')
+
+    if normalizer_fn is not None:
+      normalizer_params = normalizer_params or {}
+      outputs = normalizer_fn(outputs, **normalizer_params)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)
+
+
 @add_arg_scope
 def dropout(inputs,
             keep_prob=0.5,
@@ -1467,7 +1634,8 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
+    raise ValueError(
+        'num_outputs should be int or long, got %s.' % (num_outputs,))
 
   layer_variable_getter = _build_variable_getter({'bias': 'biases',
                                                   'kernel': 'weights'})
@@ -1689,6 +1857,55 @@ def max_pool2d(inputs,
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
+@add_arg_scope
+def max_pool3d(inputs,
+               kernel_size,
+               stride=2,
+               padding='VALID',
+               data_format=DATA_FORMAT_NDHWC,
+               outputs_collections=None,
+               scope=None):
+  """Adds a 3D Max Pooling op.
+
+  It is assumed that the pooling is done per image but not in batch or channels.
+
+  Args:
+    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
+      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
+      `data_format` is `NCDHW`.
+    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
+      pooling kernel over which the op is computed. Can be an int if both
+      values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same. Note that presently
+      both strides must have the same value.
+    padding: The padding method, either 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+
+  Returns:
+    A `Tensor` representing the results of the pooling operation.
+
+  Raises:
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+    ValueError: If 'kernel_size' is not a 3-D list
+  """
+  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+    raise ValueError('data_format has to be either NCDHW or NDHWC.')
+  with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = pooling_layers.MaxPooling3D(pool_size=kernel_size,
+                                        strides=stride,
+                                        padding=padding,
+                                        data_format=df,
+                                        _scope=sc)
+    outputs = layer.apply(inputs)
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def pool(inputs,
          kernel_size,
@@ -2346,6 +2563,8 @@ linear = functools.partial(fully_connected, activation_fn=None)
 
 # Simple alias.
 conv2d = convolution2d
+conv3d = convolution3d
 conv2d_transpose = convolution2d_transpose
+conv3d_transpose = convolution3d_transpose
 conv2d_in_plane = convolution2d_in_plane
 separable_conv2d = separable_convolution2d
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index d4ee85b550..15809ea180 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -27,7 +27,6 @@ from tensorflow.contrib.framework.python.ops import arg_scope
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers as _layers
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -49,6 +48,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 
 
@@ -121,6 +121,76 @@ class AvgPool2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])
 
 
+class AvgPool3DTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCDHW or NDHWC.'):
+      _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')
+
+  def testCreateAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    output = _layers.avg_pool3d(images, [3, 3, 3])
+    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateAvgPoolNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, 2, depth, height, width))
+    output = _layers.avg_pool3d(images, [3, 3, 3], data_format='NCDHW')
+    self.assertEquals(output.op.name, 'AvgPool3D/transpose_1')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 1, 2, 4])
+
+  def testCollectOutputs(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output_collected = ops.get_collection('outputs')[0]
+    self.assertEqual(output_collected.aliases, ['AvgPool3D'])
+    self.assertEqual(output_collected, output)
+
+  def testCreateSquareAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, 3)
+    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateAvgPoolWithScope(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], scope='pool1')
+    self.assertEqual(output.op.name, 'pool1/AvgPool3D')
+
+  def testCreateAvgPoolWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])
+
+  def testCreateAvgPoolWithSamePaddingNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
+    output = _layers.avg_pool3d(
+        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])
+
+  def testCreateAvgPoolStrideWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+
+  def testGlobalAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, images.get_shape()[1:4], stride=1)
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])
+
+
 class PoolTest(test.TestCase):
 
   def testCreatePool(self):
@@ -1559,23 +1629,23 @@ class FCTest(test.TestCase):
         inputs, 32, scope='fc1', weights_regularizer=regularizer)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
-    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
+    self.assertEqual(len(losses.get_regularization_losses()), 1)
     _layers.fully_connected(
         inputs, 32, scope='fc1', weights_regularizer=regularizer, reuse=True)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
-    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
+    self.assertEqual(len(losses.get_regularization_losses()), 1)
 
     with variable_scope.variable_scope('outer', reuse=False):
       _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
-      self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
+      self.assertEqual(len(losses.get_regularization_losses()), 2)
     with variable_scope.variable_scope('outer', reuse=True):
       _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
-      self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
+      self.assertEqual(len(losses.get_regularization_losses()), 2)
 
   def testCreateFCWithoutActivation(self):
     height, width = 3, 3
@@ -2771,6 +2841,76 @@ class MaxPool2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])
 
 
+class MaxPool3DTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCDHW or NDHWC.'):
+      _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')
+
+  def testCreateMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
+    output = _layers.max_pool3d(images, [3, 3, 3])
+    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateMaxPoolNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
+    output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
+    self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])
+
+  def testCollectOutputs(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output_collected = ops.get_collection('outputs')[0]
+    self.assertEqual(output_collected.aliases, ['MaxPool3D'])
+    self.assertEqual(output_collected, output)
+
+  def testCreateSquareMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, 3)
+    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateMaxPoolWithScope(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], scope='pool1')
+    self.assertEqual(output.op.name, 'pool1/MaxPool3D')
+
+  def testCreateMaxPoolWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])
+
+  def testCreateMaxPoolWithSamePaddingNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
+    output = _layers.max_pool3d(
+        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])
+
+  def testCreateMaxPoolStrideWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+
+  def testGlobalMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, images.get_shape()[1:4], stride=1)
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])
+
+
 class OneHotEncodingTest(test.TestCase):
 
   def testOneHotEncodingCreate(self):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index b4b552c025..6b125534a4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -52,8 +52,8 @@ LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
 
 def _train_test_split(features_and_labels):
   features, labels = features_and_labels
-  train_set = (features[:len(features) / 2], labels[:len(features) / 2])
-  test_set = (features[len(features) / 2:], labels[len(features) / 2:])
+  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
+  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
   return train_set, test_set
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 6e15e7891e..fdc266d582 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -438,7 +438,7 @@ def loss_only_head(loss_fn, head_name=None):
   Args:
     loss_fn: a function that takes no argument and returns a list of
         scalar tensors.
-    head_name: a name for for the head.
+    head_name: a name for the head.
 
   Returns:
     An instance of `Head` to hold the additional losses.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index 145d5c40fa..d3bb0fda57 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -729,7 +729,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertLess(loss, 0.07)
 
   def testSdcaOptimizerRealValuedFeatures(self):
-    """Tests LinearClasssifier with SDCAOptimizer and real valued features."""
+    """Tests LinearClassifier with SDCAOptimizer and real valued features."""
 
     def input_fn():
       return {
@@ -776,7 +776,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertLess(loss, 0.05)
 
   def testSdcaOptimizerBucketizedFeatures(self):
-    """Tests LinearClasssifier with SDCAOptimizer and bucketized features."""
+    """Tests LinearClassifier with SDCAOptimizer and bucketized features."""
 
     def input_fn():
       return {
@@ -802,7 +802,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerSparseFeatures(self):
-    """Tests LinearClasssifier with SDCAOptimizer and sparse features."""
+    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
 
     def input_fn():
       return {
@@ -833,7 +833,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerWeightedSparseFeatures(self):
-    """LinearClasssifier with SDCAOptimizer and weighted sparse features."""
+    """LinearClassifier with SDCAOptimizer and weighted sparse features."""
 
     def input_fn():
       return {
@@ -864,7 +864,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerCrossedFeatures(self):
-    """Tests LinearClasssifier with SDCAOptimizer and crossed features."""
+    """Tests LinearClassifier with SDCAOptimizer and crossed features."""
 
     def input_fn():
       return {
@@ -897,7 +897,7 @@ class LinearClassifierTest(test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testSdcaOptimizerMixedFeatures(self):
-    """Tests LinearClasssifier with SDCAOptimizer and a mix of features."""
+    """Tests LinearClassifier with SDCAOptimizer and a mix of features."""
 
     def input_fn():
       return {
@@ -1509,7 +1509,7 @@ class LinearRegressorTest(test.TestCase):
     self.assertLess(loss, 0.05)
 
   def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
-    """Tests LinearClasssifier with SDCAOptimizer and sparse features."""
+    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
 
     def input_fn():
       return {
@@ -1581,7 +1581,7 @@ class LinearRegressorTest(test.TestCase):
     self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
 
   def testSdcaOptimizerBiasOnly(self):
-    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when it's the only feature present.
@@ -1614,7 +1614,7 @@ class LinearRegressorTest(test.TestCase):
         regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)
 
   def testSdcaOptimizerBiasAndOtherColumns(self):
-    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when there are other features present.
@@ -1676,7 +1676,7 @@ class LinearRegressorTest(test.TestCase):
         regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)
 
   def testSdcaOptimizerBiasAndOtherColumnsFabricatedCentered(self):
-    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
+    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
 
     def input_fn():
       """Testing the bias weight when there are other features present.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
index 6ebfeb0f16..284e2cfd7a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
@@ -123,7 +123,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["probabilities"].eval(),
                           regression_output.value.eval())
 
-  def testEstimatorSpec_export_classsification(self):
+  def testEstimatorSpec_export_classification(self):
     predictions = self.create_predictions()
     output_alternatives = {"classification_head": (
         constants.ProblemType.CLASSIFICATION, predictions)}
@@ -143,7 +143,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classsification_with_missing_scores(self):
+  def testEstimatorSpec_export_classification_with_missing_scores(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["scores"]
@@ -165,7 +165,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classsification_with_missing_scores_proba(self):
+  def testEstimatorSpec_export_classification_with_missing_scores_proba(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["scores"]
@@ -187,7 +187,7 @@ class ModelFnopsTest(test.TestCase):
       self.assertAllEqual(predictions["classes"].eval(),
                           classification_output.classes.eval())
 
-  def testEstimatorSpec_export_classsification_with_missing_classes(self):
+  def testEstimatorSpec_export_classification_with_missing_classes(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     del output_alternatives_predictions["classes"]
@@ -208,7 +208,7 @@ class ModelFnopsTest(test.TestCase):
                           classification_output.scores.eval())
       self.assertIsNone(classification_output.classes)
 
-  def testEstimatorSpec_export_classsification_with_nonstring_classes(self):
+  def testEstimatorSpec_export_classification_with_nonstring_classes(self):
     predictions = self.create_predictions()
     output_alternatives_predictions = predictions.copy()
     output_alternatives_predictions["classes"] = constant_op.constant(
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 491c4343bd..cd47b82d82 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -455,7 +455,7 @@ class Experiment(object):
   def train_and_evaluate(self):
     """Interleaves training and evaluation.
 
-    The frequency of evaluation is controlled by the contructor arg
+    The frequency of evaluation is controlled by the constructor arg
     `min_eval_frequency`. When this parameter is 0, evaluation happens
     only after training has completed. Note that evaluation cannot happen
     more frequently than checkpoints are taken. If no new snapshots are
@@ -515,9 +515,9 @@ class Experiment(object):
 
     This differs from `train_and_evaluate` as follows:
       1. The procedure will have train and evaluation in turns. The model
-      will be trained for a number of steps (usuallly smaller than `train_steps`
+      will be trained for a number of steps (usually smaller than `train_steps`
       if provided) and then be evaluated.  `train_and_evaluate` will train the
-      model for `train_steps` (no small training iteraions).
+      model for `train_steps` (no small training iterations).
 
       2. Due to the different approach this schedule takes, it leads to two
       differences in resource control. First, the resources (e.g., memory) used
diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py
index 234605ff76..4283240d01 100644
--- a/tensorflow/contrib/learn/python/learn/models.py
+++ b/tensorflow/contrib/learn/python/learn/models.py
@@ -63,7 +63,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
     x: tensor or placeholder for input features.
     y: tensor or placeholder for labels.
     init_mean: the mean value to use for initialization.
-    init_stddev: the standard devation to use for initialization.
+    init_stddev: the standard deviation to use for initialization.
 
   Returns:
     Predictions and loss tensors.
@@ -124,7 +124,7 @@ def logistic_regression(x,
                   will check if graph contains tensor `class_weight:0`.
                   If that is not provided either all ones are used.
     init_mean: the mean value to use for initialization.
-    init_stddev: the standard devation to use for initialization.
+    init_stddev: the standard deviation to use for initialization.
 
   Returns:
     Predictions and loss tensors.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
index 3d316450d8..1924ea0f66 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
@@ -462,7 +462,7 @@ def random_tril_matrix(shape,
     remove_upper:  Python `bool`.
       If `True`, zero out the strictly upper triangle.
       If `False`, the lower triangle of returned matrix will have desired
-      properties, but will not not have the strictly upper triangle zero'd out.
+      properties, but will not have the strictly upper triangle zero'd out.
 
   Returns:
     `Tensor` with desired shape and dtype.
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index d58b9744ac..f53f38f3cf 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -208,7 +208,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
   Sample Usages:
 
   ```python
-  mapping_string = tf.constant(["emerson", "lake", "palmer")
+  mapping_string = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   table = tf.contrib.lookup.index_to_string_table_from_tensor(
       mapping_string, default_value="UNKNOWN")
@@ -260,7 +260,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   For example:
 
   ```python
-  mapping_string = tf.constant(["emerson", "lake", "palmer")
+  mapping_string = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   values = tf.contrib.lookup.index_to_string(
       indices, mapping=mapping_string, default_value="UNKNOWN")
diff --git a/tensorflow/contrib/makefile/build_with_docker.sh b/tensorflow/contrib/makefile/build_with_docker.sh
index 7fe38f4b5d..51a73fafe5 100755
--- a/tensorflow/contrib/makefile/build_with_docker.sh
+++ b/tensorflow/contrib/makefile/build_with_docker.sh
@@ -23,7 +23,7 @@
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 WORKSPACE="${SCRIPT_DIR}/../../../"
-cd ${WORKSPACE}
+cd ${WORKSPACE} || exit 1
 
 DOCKER_IMG_NAME="tf-make-base"
 DOCKER_CONTEXT_PATH="${WORKSPACE}tensorflow/contrib/makefile/"
diff --git a/tensorflow/contrib/makefile/compile_android_protobuf.sh b/tensorflow/contrib/makefile/compile_android_protobuf.sh
index 50d72d6093..fadbe271b8 100755
--- a/tensorflow/contrib/makefile/compile_android_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_android_protobuf.sh
@@ -27,7 +27,7 @@ cc_prefix="${CC_PREFIX}"
 usage() {
   echo "Usage: $(basename "$0") [-a:c]"
   echo "-a [Architecture] Architecture of target android [default=armeabi-v7a] \
-(supported archtecture list: \
+(supported architecture list: \
 arm64-v8a armeabi armeabi-v7a armeabi-v7a-hard mips mips64 x86 x86_64)"
   echo "-c Clean before building protobuf for target"
   echo "\"NDK_ROOT\" should be defined as an environment variable."
@@ -130,7 +130,7 @@ elif [[ ${ARCHITECTURE} == "x86_64" ]]; then
     sysroot_arch="x86_64"
     bin_prefix="x86_64-linux-android"
 else
-    echo "archtecture ${arcitecture} is not supported." 1>&2
+    echo "architecture ${ARCHITECTURE} is not supported." 1>&2
     usage
     exit 1
 fi
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index d1012a6c93..e8b9454e7e 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -1,4 +1,4 @@
-#!/bin/bash -x -e
+#!/bin/bash
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,6 +15,9 @@
 # ==============================================================================
 # Builds protobuf 3 for iOS.
 
+set -x
+set -e
+
 SCRIPT_DIR=$(dirname $0)
 source "${SCRIPT_DIR}/build_helper.subr"
 
@@ -30,17 +33,17 @@ fi
 
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
-GENDIR=`pwd`/gen/protobuf_ios/
+GENDIR=$(pwd)/gen/protobuf_ios/
 LIBDIR=${GENDIR}lib
 mkdir -p ${LIBDIR}
 
 OSX_VERSION=darwin14.0.0
 
-IPHONEOS_PLATFORM=`xcrun --sdk iphoneos --show-sdk-platform-path`
-IPHONEOS_SYSROOT=`xcrun --sdk iphoneos --show-sdk-path`
-IPHONESIMULATOR_PLATFORM=`xcrun --sdk iphonesimulator --show-sdk-platform-path`
-IPHONESIMULATOR_SYSROOT=`xcrun --sdk iphonesimulator --show-sdk-path`
-IOS_SDK_VERSION=`xcrun --sdk iphoneos --show-sdk-version`
+IPHONEOS_PLATFORM=$(xcrun --sdk iphoneos --show-sdk-platform-path)
+IPHONEOS_SYSROOT=$(xcrun --sdk iphoneos --show-sdk-path)
+IPHONESIMULATOR_PLATFORM=$(xcrun --sdk iphonesimulator --show-sdk-platform-path)
+IPHONESIMULATOR_SYSROOT=$(xcrun --sdk iphonesimulator --show-sdk-path)
+IOS_SDK_VERSION=$(xcrun --sdk iphoneos --show-sdk-version)
 MIN_SDK_VERSION=8.0
 
 CFLAGS="-DNDEBUG -Os -pipe -fPIC -fno-exceptions"
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index 6f47b80780..bcf097b303 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -20,7 +20,7 @@ source "${SCRIPT_DIR}/build_helper.subr"
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
 function less_than_required_version() {
-  echo $1 | (IFS=. read major minor micro
+  echo $1 | (IFS=. read -r major minor micro
     if [ $major -ne $2 ]; then
       [ $major -lt $2 ]
     elif [ $minor -ne $3 ]; then
@@ -31,7 +31,7 @@ function less_than_required_version() {
   )
 }
 
-ACTUAL_XCODE_VERSION=`xcodebuild -version | head -n 1 | sed 's/Xcode //'`
+ACTUAL_XCODE_VERSION=$(xcodebuild -version | head -n 1 | sed 's/Xcode //')
 REQUIRED_XCODE_VERSION=7.3.0
 if less_than_required_version $ACTUAL_XCODE_VERSION 7 3 0
 then
diff --git a/tensorflow/contrib/makefile/compile_pi_protobuf.sh b/tensorflow/contrib/makefile/compile_pi_protobuf.sh
index 2aae2d5f4e..f863d80009 100755
--- a/tensorflow/contrib/makefile/compile_pi_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_pi_protobuf.sh
@@ -15,15 +15,15 @@
 # ==============================================================================
 # Builds protobuf 3 for iOS.
 
-cd tensorflow/contrib/makefile
+cd tensorflow/contrib/makefile || exit 1
 
-GENDIR=`pwd`/gen/protobuf_pi/
+GENDIR=$(pwd)/gen/protobuf_pi/
 LIBDIR=${GENDIR}
 mkdir -p ${LIBDIR}
 
 CXX=arm-linux-gnueabihf-g++
 
-cd downloads/protobuf
+cd downloads/protobuf || exit 1
 
 ./autogen.sh
 if [ $? -ne 0 ]
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index aeb3c45697..4a3b3e7762 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -14,6 +14,7 @@ tensorflow/core/kernels/transpose_functor_cpu.cc
 tensorflow/core/kernels/training_op_helpers.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
+tensorflow/core/kernels/tile_functor_cpu.cc
 tensorflow/core/kernels/tile_ops.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_1.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_2.cc
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d.py b/tensorflow/contrib/ndlstm/python/lstm2d.py
index da9698ec5b..ebbb4ccf11 100644
--- a/tensorflow/contrib/ndlstm/python/lstm2d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm2d.py
@@ -91,21 +91,71 @@ def horizontal_lstm(images, num_filters_out, scope=None):
     return output
 
 
-def separable_lstm(images, num_filters_out, nhidden=None, scope=None):
+def get_blocks(images, kernel_size):
+  """Split images in blocks
+
+  Args:
+    images: (num_images, height, width, depth) tensor
+    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
+      of the pooling. Can be an int if both values are the same.
+
+  Returns:
+    (num_images, height/kernel_height, width/kernel_width,
+    depth*kernel_height*kernel_width) tensor
+  """
+  with variable_scope.variable_scope("image_blocks"):
+    batch_size, height, width, chanels = _shape(images)
+
+    if height % kernel_size[0] != 0:
+      offset = array_ops.zeros([batch_size,
+                                kernel_size[0] - (height % kernel_size[0]),
+                                width,
+                                chanels])
+      images = array_ops.concat([images, offset], 1)
+      batch_size, height, width, chanels = _shape(images)
+    if width % kernel_size[1] != 0:
+      offset = array_ops.zeros([batch_size,
+                                height,
+                                kernel_size[1] - (width % kernel_size[1]),
+                                chanels])
+      images = array_ops.concat([images, offset], 2)
+      batch_size, height, width, chanels = _shape(images)
+
+    h, w = int(height / kernel_size[0]), int(width / kernel_size[1])
+    features = kernel_size[1] * kernel_size[0] * chanels
+
+    lines = array_ops.split(images, h, axis=1)
+    line_blocks = []
+    for line in lines:
+      line = array_ops.transpose(line, [0, 2, 3, 1])
+      line = array_ops.reshape(line, [batch_size, w, features])
+      line_blocks.append(line)
+
+    return array_ops.stack(line_blocks, axis=1)
+
+
+def separable_lstm(images, num_filters_out,
+                   kernel_size=None, nhidden=None, scope=None):
   """Run bidirectional LSTMs first horizontally then vertically.
 
   Args:
     images: (num_images, height, width, depth) tensor
     num_filters_out: output layer depth
+    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
+      of the pooling. Can be an int if both values are the same. Set to None for
+      not using blocks
     nhidden: hidden layer depth
     scope: optional scope name
 
   Returns:
-    (num_images, height, width, num_filters_out) tensor
+    (num_images, height/kernel_height, width/kernel_width,
+    num_filters_out) tensor
   """
   with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
     if nhidden is None:
       nhidden = num_filters_out
+    if kernel_size is not None:
+      images = get_blocks(images, kernel_size)
     hidden = horizontal_lstm(images, nhidden)
     with variable_scope.variable_scope("vertical"):
       transposed = array_ops.transpose(hidden, [0, 2, 1, 3])
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d_test.py b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
index 3dbbb81796..f1b37d701b 100644
--- a/tensorflow/contrib/ndlstm/python/lstm2d_test.py
+++ b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
@@ -69,6 +69,14 @@ class Lstm2DTest(test_util.TensorFlowTestCase):
       result = outputs.eval()
       self.assertEqual(tuple(result.shape), (2, 7, 11, 8))
 
+  def testSeparableLstmDimsBlocks(self):
+    with self.test_session():
+      inputs = constant_op.constant(_rand(2, 7, 11, 5))
+      outputs = lstm2d.separable_lstm(inputs, 8, kernel_size=[2, 2])
+      variables.global_variables_initializer().run()
+      result = outputs.eval()
+      self.assertEqual(tuple(result.shape), (2, 4, 6, 8))
+
   def testReduceToSequenceDims(self):
     with self.test_session():
       inputs = constant_op.constant(_rand(2, 7, 11, 5))
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index c7ed663131..288f59fed4 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -7,6 +7,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 tf_gen_op_wrapper_py(
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 9c5e9fec9d..ecce1d22f0 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -79,7 +79,7 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+    http://www.bioinf.jku.at/publications/older/2604.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
@@ -1110,14 +1110,14 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     if input_size is None:
       input_size = inputs.get_shape().as_list()[1]
     inputs = _linear([inputs, attns], input_size, True)
-    lstm_output, new_state = self._cell(inputs, state)
+    cell_output, new_state = self._cell(inputs, state)
     if self._state_is_tuple:
       new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
     else:
       new_state_cat = new_state
     new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
     with vs.variable_scope("attn_output_projection"):
-      output = _linear([lstm_output, new_attns], self._attn_size, True)
+      output = _linear([cell_output, new_attns], self._attn_size, True)
     new_attn_states = array_ops.concat(
         [new_attn_states, array_ops.expand_dims(output, 1)], 1)
     new_attn_states = array_ops.reshape(
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 491d87f62d..3496b355b4 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -65,7 +65,9 @@ class GatherTreeTest(test.TestCase):
         _ = beams.eval()
 
   def testBadParentValuesOnGPU(self):
-    if not test.is_gpu_available():
+    # Only want to run this test on CUDA devices, as gather_tree is not
+    # registered for SYCL devices.
+    if not test.is_gpu_available(cuda_only=True):
       return
     # (max_time = 4, batch_size = 1, beams = 3)
     # bad parent in beam 1 time 1; appears as a negative index at time 0
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index 4795dfb8c9..fbe53fc60a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -99,9 +99,9 @@ class Decoder(object):
       name: Name scope for any created operations.
 
     Returns:
-      `(outputs, next_state, next_inputs, finished)`: `outputs` is an instance
-      of BasicDecoderOutput, `next_state` is a (structure of) state tensors and
-      TensorArrays, `next_inputs` is the tensor that should be used as input for
+      `(outputs, next_state, next_inputs, finished)`: `outputs` is an object
+      containing the decoder output, `next_state` is a (structure of) state tensors
+      and TensorArrays, `next_inputs` is the tensor that should be used as input for
       the next step, `finished` is a boolean tensor telling whether the sequence
       is complete, for each sequence in the batch.
     """
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index d37c632be7..4bde661698 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -836,7 +836,7 @@ with tf.Session() as sess:
   for batch_id in range(num_batches):
     sess.run(names_to_updates.values())
 
-  metric_values = sess.run(name_to_values.values())
+  metric_values = sess.run(names_to_values.values())
   for metric, value in zip(names_to_values.keys(), metric_values):
     print('Metric %s has value: %f' % (metric, value))
 ```
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index 824ba4c09b..4fa1ccea69 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,3 +1,26 @@
 # tfprof: TensorFlow Profiler and Beyond
 
 # Full Document in tensorflow/tools/tfprof/README.md
+
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
+
+Consultants: Jon Shlens, Pete Warden
+
+###Major Features
+
+1.  Measure model parameters, float operations, tensor shapes.
+2.  Profile op execution times, requested memory size and device placement.
+3.  Inspect checkpoint tensors' shapes and their values.
+4.  Selectively group, filter, account and order ops.
+
+####tfprof supports 3 views to organize TensorFlow model profiles
+
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
+
+####For each view, there are 3 ways to display outputs:
+
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
index 9c59df3117..71468dde37 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py
@@ -89,7 +89,7 @@ def _run_loop_model():
 class RunMetadataTest(test.TestCase):
 
   def testGPU(self):
-    if not test.is_gpu_available():
+    if not test.is_gpu_available(cuda_only=True):
       return
 
     ops.reset_default_graph()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 168726a6b3..0f15133d74 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -171,7 +171,6 @@ class TpuEstimator(estimator_lib.Estimator):
   Note: TpuEstimator transforms a global batch size in params to a per-shard
         batch size when calling the input_fn.
   """
-
   def __init__(self,
                model_fn=None,
                model_dir=None,
@@ -196,6 +195,8 @@ class TpuEstimator(estimator_lib.Estimator):
             .format(params[_BATCH_SIZE_KEY], config.tpu_config.num_shards))
 
     if use_tpu:
+      if not isinstance(config, tpu_config.RunConfig):
+        raise ValueError('`config` must be `tpu_config.RunConfig`')
       # Verifies the model_fn signature according to Estimator framework.
       estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
       # We cannot store config and params in this constructor as parent
@@ -204,7 +205,6 @@ class TpuEstimator(estimator_lib.Estimator):
       model_function = wrapped_model_fn(model_fn)
     else:
       model_function = model_fn
-
     super(TpuEstimator, self).__init__(
         model_fn=model_function,
         model_dir=model_dir,
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 24b733dd29..a895d90b8e 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -226,7 +226,7 @@ def checkpoints_iterator(checkpoint_dir,
 
   This behavior gives control to callers on what to do if checkpoints do not
   come fast enough or stop being generated.  For example, if callers have a way
-  to detect that the training has stopped and know that no new new checkpoints
+  to detect that the training has stopped and know that no new checkpoints
   will be generated, they can provide a `timeout_fn` that returns `True` when
   the training has stopped.  If they know that the training is still going on
   they return `False` instead.
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index bc687be0ab..6f3a616fe8 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -683,7 +684,6 @@ void RdmaTensorBuffer::SendNextItem() {
                          << " error message: " << status.error_message();
       size_t buffer_size = RdmaMessage::kMessageTotalBytes;
       size_t tensor_bytes = 0;
-      TensorProto proto;
       // Figures out which device the tensor is hosted on.
       Device* src_dev = nullptr;
       Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
@@ -703,21 +703,47 @@ void RdmaTensorBuffer::SendNextItem() {
       CHECK(s.ok()) << "dst device not found";
       AllocatorAttributes dst_alloc_attr;
       dst_alloc_attr.set_on_host(true);
+
+      bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
       // string tensor needs to be serialized
+      Tensor copy;
+      StringPiece copy_buf;
+      TensorProto proto;
       if (src_dev->tensorflow_gpu_device_info() &&
           (!send_args.alloc_attrs.on_host())) {
         CHECK(send_args.device_context)
-            << "send dev name: " << src_dev->name()
-            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-        // "val" is on a GPU. Uses GPUUtil to fill the proto.
-        s = VerbsUtil::SetProtoFromGPUSync(
-            in, src_dev, send_args.device_context, &proto, is_dead);
-        CHECK(s.ok()) << "set proto from gpu sync";
+          << "send dev name: " << src_dev->name()
+          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+
+        if (can_memcpy) {
+          AllocatorAttributes host_alloc_attrs;
+          host_alloc_attrs.set_gpu_compatible(true);
+          host_alloc_attrs.set_on_host(true);
+          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+          copy = Tensor(alloc, in.dtype(), in.shape());
+          s = VerbsUtil::CopyGPUTensorToCPUSync(
+              src_dev, send_args.device_context, &in, &copy);
+          CHECK(s.ok()) << "copy tensor from gpu sync";
+          copy_buf = copy.tensor_data();
+        } else {
+          // "val" is on a GPU. Uses GPUUtil to fill the proto.
+          s = VerbsUtil::SetProtoFromGPUSync(
+              in, src_dev, send_args.device_context, &proto, is_dead);
+          CHECK(s.ok()) << "set proto from gpu sync";
+        }
       } else {
         // tensor is in CPU memory.
-        in.AsProtoTensorContent(&proto);
+        if (can_memcpy) {
+          copy_buf = in.tensor_data();
+        } else {
+          in.AsProtoTensorContent(&proto);
+        }
+      }
+      if (can_memcpy) {
+        tensor_bytes = in.TotalBytes();
+      } else {
+        tensor_bytes = proto.ByteSize();
       }
-      tensor_bytes = proto.ByteSize();
       // maybe some margin for string tensor?
       buffer_size += tensor_bytes;
       // prepare message
@@ -771,7 +797,16 @@ void RdmaTensorBuffer::SendNextItem() {
               static_cast<void*>(static_cast<char*>(buffer_) +
                                  RdmaMessage::kTensorBufferStartIndex);
           CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-          proto.SerializeToArray(output, tensor_bytes);
+          if (can_memcpy) {
+            CHECK(copy_buf.size() == tensor_bytes)
+               << "unexpected tensor size: "
+               << copy_buf.size()
+               << " != "
+               << tensor_bytes;
+            memcpy(output, copy_buf.data(), tensor_bytes);
+          } else {
+            proto.SerializeToArray(output, tensor_bytes);
+          }
         } else {
           buffer_size = RdmaMessage::kMessageTotalBytes;
         }
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 5871400f26..9ea696589a 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -99,12 +100,40 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
     if (!rm.is_dead_) {
       void* input = static_cast<char*>(rb->buffer_) +
                     RdmaMessage::kTensorBufferStartIndex;
-      TensorProto proto;
-      CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-            rb->size_);
-      CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-          << "fail to parse proto from array";
-      s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
+      if (can_memcpy) {
+        if (dst_dev->tensorflow_gpu_device_info() &&
+            (!recv_args.alloc_attrs.on_host())) {
+          CHECK(recv_args.device_context)
+            << "send dev name: " << src_dev->name()
+            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+
+          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
+          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
+          s = VerbsUtil::CopyCPUTensorToGPUSync(&copy, recv_args.device_context,
+                                                dst_dev, &gpu_copy);
+          CHECK(s.ok()) << "copy tensor to gpu sync";
+          val = std::move(gpu_copy);
+        } else {
+          AllocatorAttributes host_alloc_attrs;
+          host_alloc_attrs.set_gpu_compatible(true);
+          host_alloc_attrs.set_on_host(true);
+          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+          val = std::move(copy);
+        }
+      } else {
+        TensorProto proto;
+        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
+              rb->size_);
+        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
+            << "fail to parse proto from array";
+        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      }
     }
 
     rc->RemoveRecvCallback(key_with_step_id);
diff --git a/tensorflow/contrib/verbs/verbs_util.cc b/tensorflow/contrib/verbs/verbs_util.cc
index c3350f7958..76e44d34a9 100644
--- a/tensorflow/contrib/verbs/verbs_util.cc
+++ b/tensorflow/contrib/verbs/verbs_util.cc
@@ -20,6 +20,40 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 namespace tensorflow {
 
+// static sync wrapper:
+Status VerbsUtil::CopyGPUTensorToCPUSync(Device* gpu_device,
+                              const DeviceContext* device_context,
+                              const Tensor* gpu_tensor,
+                              Tensor* cpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context,
+                              gpu_tensor, cpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
+// static sync wrapper:
+Status VerbsUtil::CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                         const DeviceContext* device_context,
+                                         Device* gpu_device,
+                                         Tensor* gpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, device_context,
+                              gpu_device, gpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
 // static sync wrapper:
 Status VerbsUtil::SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                       const DeviceContext* device_context,
diff --git a/tensorflow/contrib/verbs/verbs_util.h b/tensorflow/contrib/verbs/verbs_util.h
index cbc01adae4..d9da396228 100644
--- a/tensorflow/contrib/verbs/verbs_util.h
+++ b/tensorflow/contrib/verbs/verbs_util.h
@@ -28,6 +28,16 @@ class TensorProto;
 
 class VerbsUtil {
  public:
+  // synchronous wrapper of CopyGPUTensorToCPU
+  static Status CopyGPUTensorToCPUSync(Device* gpu_device,
+                                       const DeviceContext* device_context,
+                                       const Tensor* gpu_tensor,
+                                       Tensor* cpu_tensor);
+  // synchronous wrapper of CopyCPUTensorToGPU
+  static Status CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                       const DeviceContext* device_context,
+                                       Device* gpu_device,
+                                       Tensor* gpu_tensor);
   // synchronous wrapper of SetProtoFromGPU
   static Status SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                     const DeviceContext* device_context,
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 9f1554b5b6..24b0f702b3 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -121,7 +121,6 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
-load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 
 # -----------------------------------------------------------------------------
 # Public targets
@@ -820,7 +819,7 @@ cc_library(
         ":test",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"])
+        "//tensorflow/core/platform/default/build_config:gtest",
     ],
 )
 
@@ -1854,7 +1853,7 @@ cc_library(
         "common_runtime/sycl/sycl_device_context.h",
     ]),
     copts = tf_copts(),
-    linkstatic = 1,
+    linkstatic = 0,
     deps = [
         ":core_cpu",
         ":core_cpu_internal",
@@ -1866,7 +1865,7 @@ cc_library(
         "//third_party/eigen3",
         "@local_config_sycl//sycl:sycl",
     ],
-    alwayslink = 1,
+    alwayslink = 0,
 )
 
 # -----------------------------------------------------------------------------
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 21a20bcc4d..4b951691fb 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -654,11 +654,11 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   // If requested via RunOptions, output the partition graphs.
   if (run_options.output_partition_graphs()) {
-    protobuf::RepeatedPtrField<GraphDef>* parition_graph_defs =
+    protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
         run_metadata->mutable_partition_graphs();
     for (const PerPartitionExecutorsAndLib& exec_and_lib :
          executors_and_keys->items) {
-      GraphDef* partition_graph_def = parition_graph_defs->Add();
+      GraphDef* partition_graph_def = partition_graph_defs->Add();
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 3827596a67..d6f35fe24c 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -279,9 +279,12 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
 
         DeleteDumpDir();
       } else {
+        // CUDA and SYCL devices do not have an Identity op for strings
         LOG(ERROR) << "Error: " << s;
         ASSERT_TRUE((a_dev.device_type() == DEVICE_GPU) ||
-                    (b_dev.device_type() == DEVICE_GPU));
+                    (a_dev.device_type() == DEVICE_SYCL) ||
+                    (b_dev.device_type() == DEVICE_GPU) ||
+                    (b_dev.device_type() == DEVICE_SYCL));
         ASSERT_FALSE(s.ok());
       }
     }
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 5ca1d92a81..ffcbc50a2a 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -88,27 +88,39 @@ void WorkerCacheLogger::RecordRecvTensor(int64 step_id, int64 start_usecs,
                                          const string& src_device,
                                          const string& dst_device,
                                          int64 bytes) {
-  NodeExecStats* ns = new NodeExecStats;
-  ns->set_node_name("RecvTensor");
-  string byte_string = strings::StrCat("[", bytes, "B] ");
-  if (bytes >= 0.1 * 1048576.0) {
-    byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
-  }
-  ns->set_timeline_label(strings::StrCat(byte_string, tensor_name, " from ",
-                                         src_device, " to ", dst_device));
-  ns->set_all_start_micros(start_usecs);
-  ns->set_op_start_rel_micros(0);
-  int64 elapsed = end_usecs - start_usecs;
-  ns->set_op_end_rel_micros(elapsed);
-  ns->set_all_end_rel_micros(elapsed);
-  NodeOutput* no = ns->add_output();
-  no->set_slot(0);
-  // TODO(tucker): Maybe set the dimensions too, but then they'll
-  // need to be passed in.
-  no->mutable_tensor_description()
-      ->mutable_allocation_description()
-      ->set_requested_bytes(bytes);
-  Save(dst_device, step_id, ns);
+  RecordDataTransfer(step_id, start_usecs, end_usecs, tensor_name, src_device,
+    dst_device, bytes, "", "RecvTensor");
 }
 
+void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
+                                           int64 end_usecs,
+                                           const string& tensor_name,
+                                           const string& src_device,
+                                           const string& dst_device,
+                                           int64 bytes,
+                                           const string& details,
+                                           const string& transfer_method_name){
+  NodeExecStats* ns = new NodeExecStats;
+    ns->set_node_name(transfer_method_name);
+    string byte_string = strings::StrCat("[", bytes, "B] ");
+    if (bytes >= 0.1 * 1048576.0) {
+      byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
+    }
+    ns->set_timeline_label(strings::StrCat(byte_string, tensor_name, " from ",
+                                           src_device, " to ", dst_device,
+                                           details));
+    ns->set_all_start_micros(start_usecs);
+    ns->set_op_start_rel_micros(0);
+    int64 elapsed = end_usecs - start_usecs;
+    ns->set_op_end_rel_micros(elapsed);
+    ns->set_all_end_rel_micros(elapsed);
+    NodeOutput* no = ns->add_output();
+    no->set_slot(0);
+    // TODO(tucker): Maybe set the dimensions too, but then they'll
+    // need to be passed in.
+    no->mutable_tensor_description()
+        ->mutable_allocation_description()
+        ->set_requested_bytes(bytes);
+    Save(dst_device, step_id, ns);
+  }
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.h b/tensorflow/core/distributed_runtime/worker_cache_logger.h
index a92590a176..00846c25fe 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.h
@@ -60,6 +60,14 @@ class WorkerCacheLogger {
                         const string& tensor_name, const string& src_device,
                         const string& dst_device, int64 bytes);
 
+  // Generates a NodeExecStats record with the given data, and saves for
+  // later retrieval by RetrieveLogs().
+  void RecordDataTransfer(int64 step_id, int64 start_usecs, int64 end_usecs,
+                          const string& tensor_name, const string& src_device,
+                          const string& dst_device, int64 bytes,
+                          const string& details,
+                          const string& transfer_method_name);
+
  private:
   mutex count_mu_;
   int32 want_logging_count_ GUARDED_BY(count_mu_) = 0;
diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h
index 56355eaf36..950737c39a 100644
--- a/tensorflow/core/framework/graph_def_util.h
+++ b/tensorflow/core/framework/graph_def_util.h
@@ -62,7 +62,7 @@ Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
 // attr with a default was added). Note that this will not affect
 // attrs with non-default values, so you must run a
 // ValidateGraphDef...() function to see if the result is in fact
-// compatible. If not nulllptr, the op/attr pairs that were removed
+// compatible. If not nullptr, the op/attr pairs that were removed
 // are added to '*op_attr_removed'.
 //
 // Expected usage, for a producer that wants to prepare a graph for
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index c5a0983a54..a4dd06de45 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -205,7 +205,6 @@ class OpDefBuilderWrapper;
 template <>
 class OpDefBuilderWrapper<true> {
  public:
-  typedef OpDefBuilderWrapper<true> WrapperType;
   OpDefBuilderWrapper(const char name[]) : builder_(name) {}
   OpDefBuilderWrapper<true>& Attr(StringPiece spec) {
     builder_.Attr(spec);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index f8a5946116..fd5c8fdbb8 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -672,7 +672,7 @@ class OpKernelContext {
   void forward_ref_input_to_ref_output(int input_index, int output_index);
 
   // Returns true when an alias to input[input_index], reshaped to output_shape,
-  // which is is safe to use for in-place computation was written to *output.
+  // which is safe to use for in-place computation was written to *output.
   // Returns false if input[input_index] has a refcount greater than one, or if
   // its type does not match the expected output type of output[output_index],
   // or the number of elements in input[input_index] does not equal the number
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 49eecc0b08..a164fe61b5 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -307,7 +307,7 @@ class Tensor {
   /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing the
   /// first 'begin' Tensor dimensions into the first dimension of the result and
   /// the Tensor dimensions of the last dims() - 'begin' - NDIMS into the last
-  /// dimension of the result. If 'begin' < 0 then the the |'begin'| leading
+  /// dimension of the result. If 'begin' < 0 then the |'begin'| leading
   /// dimensions of size 1 will be added. If 'begin' + NDIMS > dims() then
   /// 'begin' + NDIMS - dims() trailing dimensions of size 1 will be added.
   template <typename T, size_t NDIMS = 3>
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 94741a11ff..625780e7c9 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -247,16 +247,10 @@ namespace tensorflow {
 //
 //           P = Conv2DWithBiasBackpropBias(O, O_m)
 //
-// 'Distance' between input of BiasAddGrad and _MklConv2D in terms of hops is
-// the context matching depth. If _MklConv2DWithBias is not within the context
-// matching depth, then we do not rewrite BiasAddGrad.
-
-// How many hops do we search for matching node in the backward dataflow graph?
-// We use maxhop of 10 based on empirical observations. Also, these are
-// maxhops in backward data-flow graph. Since input of forward nodes (Conv2D)
-// directly goes to backward nodes, we do not expect the hop-distance
-// would be more than few nodes.
-static size_t kNodeMergeContextMaxDepth = 10;
+// Rewrite of BiasAddGrad into Conv2DWithBiasBackpropBias takes place depending
+// on the matching 'context'. The term context is loosely related to which
+// forward op is _associated_ to BiasAddGrad. If it is _MklConv2DWithBias then
+// we consider it Conv2D context; if it is MatMul, then it is MatMul context.
 
 class MklLayoutRewritePass : public GraphOptimizationPass {
  public:
@@ -280,6 +274,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.max_pool = "MaxPool";
     csinfo_.max_pool_grad = "MaxPoolGrad";
     csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
                                    "_MklConv2DWithBiasBackpropBias";
@@ -360,16 +356,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
                       csinfo_.mkl_conv2d_with_bias});
 
-    // We use maxhop of 10 based on empirical observations. Also, these are
-    // maxhops in backward data-flow graph. Since input of forward nodes
-    // (Conv2D) directly goes to backward nodes, we do not expect the
-    // hop-distance would be more than few nodes.
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
-                                   kNodeMergeContextMaxDepth};
+                                   IsBiasAddGradInMatMulContext};
 
     biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
                                    csinfo_.mkl_conv2d_with_bias,
-                                   kNodeMergeContextMaxDepth};
+                                   IsBiasAddGradInConv2DWithBiasContext};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -392,9 +384,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string node;     // Name of the node to be rewritten
     string fwd;      // Name of the node in the forward pass that this node
                      // corresponds to
-    size_t max_hop;  // Maximum number of hops the fwd is located
-                     // from this node. If the fwd is farther than max_hop
-                     // then we do not rewrite the node.
+    std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
   } ContextInfo;
 
   /// Structure to specify the name of an original node, its new name after
@@ -438,7 +428,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
-  struct {
+  typedef struct {
     string avg_pool;
     string avg_pool_grad;
     string bias_add;
@@ -457,13 +447,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string max_pool;
     string max_pool_grad;
     string mkl_conv2d;
+    string mkl_conv2d_grad_input;
+    string mkl_conv2d_grad_filter;
     string mkl_conv2d_with_bias;
     string mkl_conv2d_with_bias_backprop_bias;
     string relu;
     string relu_grad;
     string reshape;
     string split;
-  } csinfo_;
+  } ConstStringsInfo;
 
  private:
   /// Maintain info about nodes to rewrite
@@ -478,6 +470,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to rewrite
   static std::vector<ContextInfo*> cinfo_;
 
+  /// Maintain structure of constant strings
+  static ConstStringsInfo csinfo_;
+
   /// Context variables used in referencing rules
   static ContextInfo biasaddgrad_matmul_context_;
   static ContextInfo biasaddgrad_conv2dwithbias_context_;
@@ -629,6 +624,173 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
+  // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node
+  // specified in contextinfo 'ci'. Function updates fwd_node to point
+  // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias.
+  //
+  // Association checks for one of the following graphs:
+  //
+  // Graph A:
+  //
+  // _ = Conv2DWithBias(F, I, _)
+  // ..
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = Conv2DBackpropInput(_, I, G)
+  // _ = BiasAddGrad(G)
+  //
+  // OR
+  //
+  // Graph B:
+  //
+  // _ = Conv2DWithBias(F, _, _)
+  // ..
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = BiasAddGrad(G)
+  //
+  // Here F, G, and I are graph nodes; _ represents graph nodes that we
+  // don't care here.
+  //
+  // @return - true (if BiasAddGrad is associated with Conv2DWithBias);
+  //           false otherwise.
+  static bool IsBiasAddGradInConv2DWithBiasContext(const Node* n,
+                                                   const Node** fwd_node,
+                                                   void* ci) {
+    CHECK_NOTNULL(n);
+    CHECK_NOTNULL(fwd_node);
+    CHECK_NOTNULL(ci);
+    *fwd_node = nullptr;
+
+    CHECK_EQ(n->type_string(), csinfo_.bias_add_grad);
+
+    // Get the only 1 input of BiasAddGrad.
+    CHECK_EQ(n->num_inputs(), 1);
+    const Node* bias_add_grad_inp = nullptr;
+    TF_CHECK_OK(n->input_node(0, &bias_add_grad_inp));
+    CHECK_NOTNULL(bias_add_grad_inp);
+
+    // Check if this input also goes to BackpropFilter and BackpropInput
+    // as 3rd input.
+    bool found_backprop_input = false;
+    bool found_backprop_filter = false;
+    Node* backprop_filter_node = nullptr;
+    Node* backprop_input_node = nullptr;
+
+    for (const Edge* e : bias_add_grad_inp->out_edges()) {
+      Node* third_input = nullptr;
+      if (e->dst()->type_string() == csinfo_.conv2d_grad_input ||
+          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_input) {
+        // Third input (index 2) of BackpropInput
+        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
+        // Third input (index 2) of BackpropInput must be same as the input
+        // of BiasAddGrad.
+        if (third_input == bias_add_grad_inp) {
+          found_backprop_input = true;
+          backprop_input_node = e->dst();
+        }
+      }
+
+      if (e->dst()->type_string() == csinfo_.conv2d_grad_filter ||
+          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_filter) {
+        // Third input (index 2) of BackpropFilter
+        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
+        // Third input (index 2) of BackpropFilter must be same as the input
+        // of BiasAddGrad.
+        if (third_input == bias_add_grad_inp) {
+          found_backprop_filter = true;
+          backprop_filter_node = e->dst();
+        }
+      }
+
+      // If we found both the nodes, then we can stop the search.
+      if (found_backprop_input && found_backprop_filter) {
+        break;
+      }
+    }
+
+    // If BackpropFilter node is not found, then this is not
+    // Conv2DWithBias context. For 2nd graph in the example above, only
+    // BackpropFilter would be present.
+    if (!found_backprop_filter) {
+      return false;
+    }
+
+    // Otherwise, we found the nodes.
+    CHECK_NOTNULL(backprop_filter_node);
+    if (found_backprop_input) {
+      CHECK_NOTNULL(backprop_input_node);
+    }
+
+    // Now that we confirmed that this is Conv2DWithBias context, we need to
+    // get access to the forward node (Conv2DWithBias). 2nd input of
+    // Conv2DWithBias is same as the 2nd input of Conv2DBackpropInput; 1st
+    // input of Conv2DWithBias is same as the 1st input of Conv2DBackpropFilter
+    // (This comes from definition of gradient computation for Conv2D).
+    if (found_backprop_input) {
+      // Graph A in the example.
+      Node* second_inp_of_input = nullptr;
+      Node* first_inp_of_filter = nullptr;
+      TF_CHECK_OK(backprop_input_node->input_node(1, &second_inp_of_input));
+      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
+      CHECK_NOTNULL(second_inp_of_input);
+      CHECK_NOTNULL(first_inp_of_filter);
+
+      // Now we need to find out Conv2DWithBias node from these input nodes.
+      // Conv2DWithBias node is the node that accepts both the nodes
+      // second_inp_of_input and first_inp_of_filter in 2nd and 1st input slots.
+      for (const Edge* fe : first_inp_of_filter->out_edges()) {
+        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+            fe->dst_input() == 0) {
+          for (const Edge* ie : second_inp_of_input->out_edges()) {
+            if (ie->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+                ie->dst_input() == 1 && fe->dst() == ie->dst()) {
+              VLOG(1) << "MklLayoutRewritePass: found "
+                      << fe->dst()->DebugString()
+                      << " as the forward node for matching context, backward"
+                      << " node is: " << n->DebugString();
+              *fwd_node = fe->dst();
+              return true;
+            }
+          }
+        }
+      }
+    } else {
+      // We did not find BackpropInput, so we work with BackpropFilter only.
+      // Graph B in the example.
+      Node* first_inp_of_filter = nullptr;
+      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
+      CHECK_NOTNULL(first_inp_of_filter);
+
+      // Now we need to find out Conv2DWithBias node from first input of
+      // BackpropFIlter. Conv2DWithBias node is the node that accepts
+      // first_inp_of_filter in 1st input slot.
+      for (const Edge* fe : first_inp_of_filter->out_edges()) {
+        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+            fe->dst_input() == 0) {
+          VLOG(1) << "MklLayoutRewritePass: found "
+                  << fe->dst()->DebugString()
+                  << " as the forward node for matching context, backward"
+                  << " node is: " << n->DebugString();
+          *fwd_node = fe->dst();
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  // Is BiasAddGrad node in 'n' is associated with MatMul node
+  // specified in contextinfo 'ci'. Function does not update fwd_node.
+  //
+  // @return - true (if BiasAddGrad is associated with MatMul);
+  //           false otherwise.
+  static bool IsBiasAddGradInMatMulContext(const Node* n,
+                                           const Node** fwd_node,
+                                           void* ci) {
+    return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
+  }
+
+
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -639,8 +801,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
 
   // Helper function that searches the matching contextinfo for the node.
-  // Implements depth-first search in the data dependence graph for the
-  // gradient op in the backward direction.
   //
   // @input n - Node (gradient op) whose contextinfo is to be searched,
   //        fwd_node - pointer to node from the forward pass that this node
@@ -788,6 +948,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                    Node* orig_node);
 };
 
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
   MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
@@ -1667,12 +1828,12 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   const ContextInfo* ci = nullptr;
   bool is_context_based_rewrite = false;
   if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
-    CHECK_NOTNULL(fwd_node);
     is_context_based_rewrite = true;
 
     // Sanity checks for context-based rewrite (if any)
     if (orig_node->type_string() == csinfo_.bias_add_grad &&
         ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
+      CHECK_NOTNULL(fwd_node);
       DataType orig_T, ctx_T;
       string orig_data_format, ctx_data_format;
       TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
@@ -1784,69 +1945,17 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   CHECK_NOTNULL(fwd_node);
   *fwd_node = nullptr;
 
-  // Search for matching contextinfo based on node name.
-  // There could be more than one matching contextinfos.
-  bool is_matching_cinfo_found = false;
-  std::vector<const ContextInfo*> mci;
+  // Search for matching contextinfo based on node name and call
+  // callback function using matching contextinfo.
+  // There could be more than one matching contextinfos but whichever
+  // matches first is returned.
   for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == (*ci)->node) {
-      mci.push_back(*ci);
-      is_matching_cinfo_found = true;
+    if (n->type_string() == (*ci)->node &&
+        (*ci)->context_match_fn(n, fwd_node, *ci)) {
+      VLOG(1) << "Found context as matching: " << (*ci)->fwd;
+      return *ci;
     }
   }
-  // If no matching contextinfo is found, return immediately.
-  if (!is_matching_cinfo_found) {
-    return nullptr;
-  }
-
-  VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
-          << " in backwards.";
-
-  // Now we will check for forward op name for context info in data
-  // flow graph. Get the max hops we should search for the fwd node.
-  // We are now going to search (breadth-first) backwards in data
-  // dependence graph (for up to max hops) from n for the node
-  // specified in fwd.
-  // queue to maintain nodes to be visited and depth info for
-  // breadth-first search
-  std::queue<std::pair<const Node*, int>> nqueue;
-  const Node* curr_node = n;
-  size_t curr_depth = 0;
-  nqueue.push(std::make_pair(curr_node, curr_depth));
-
-  while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
-    std::pair<const Node*, int> curr_pair = nqueue.front();
-    nqueue.pop();
-
-    std::set<const Node*> visited_nodes;
-    curr_node = curr_pair.first;
-    curr_depth = curr_pair.second;
-    CHECK_NOTNULL(curr_node);
-
-    VLOG(1) << "MklLayoutRewritePass: Visiting node: "
-            << curr_node->type_string() << " at depth: " << curr_depth
-            << " for node: " << n->type_string();
-
-    // If we find a match, we return immediately.
-    for (const ContextInfo* ci : mci) {
-      if (curr_node->type_string() == ci->fwd) {
-        *fwd_node = curr_node;
-        return ci;
-      }
-    }
-
-    // Else we explore backward edges from current node.
-    // Add the source nodes of all incoming edges of the node to the queue.
-    for (const Edge* e : curr_node->in_edges()) {
-      // We do not visit already visited node.
-      if (visited_nodes.find(e->src()) == visited_nodes.end()) {
-        // Depth of these nodes is 1 more than the depth of current node.
-        nqueue.push(std::make_pair(e->src(), curr_depth + 1));
-        visited_nodes.insert(e->src());
-      }
-    }
-  } /* while */
-
   return nullptr;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 3c4a5263af..efbe2134e0 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -345,7 +345,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
 // Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
 // rewrite tests
 
-// D=_MklConv2D(A,M,B,N,C,O); E=Sub(D,A); F=BiasAddGrad(E)
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -364,16 +365,255 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
       "node { name: 'E' op: 'Sub'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['D', 'A']}"
-      "node { name: 'F' op: 'BiasAddGrad'"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " input: ['E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);"
-            "N(_MklInput);O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;"
-            "DMT/_0->F:1;E->F;E:control->DMT/_0:control;M->D:3;N->D:4;"
-            "O->D:5");
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
+            "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
+            "E:control->DMT/_0:control;F->G:1;H->I;M->D:3;M->G:3;M->I:3;"
+            "N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput. But nodes do not match criteria for rewrite. So
+// rewrite should not happen.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
+            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput. But nodes do not match criteria for rewrite. So
+// rewrite should not happen.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
+            "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
+            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);"
+            "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
+            "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
+            "N->D:4;N->G:4;O->D:5;O->G:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+// But BackpropFilter node inputs do not satisfy criteria for rewrite.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+// But BackpropFilter node inputs do not satisfy criteria for rewrite.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
+            "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
 }
 
 // No _MklConv2DWithBias in context, but _MklConv2D in context.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index e959eab54e..f6595fcbb3 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -728,7 +728,7 @@ TEST_F(VirtualSchedulerTest, ComplexDependency) {
            1 /* control dependency */);
   EXPECT_EQ(expected_size, cpu_state.memory_usage);
 
-  // Nodes currrently in memory: bn's port -1, 0, and 2, and x's port 0.
+  // Nodes currently in memory: bn's port -1, 0, and 2, and x's port 0.
   std::set<std::pair<string, int>> nodes_in_memory;
   std::transform(
       cpu_state.nodes_in_memory.begin(), cpu_state.nodes_in_memory.end(),
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 84a7681782..1e7a9dfaf5 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/protobuf/queue_runner.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.cc b/tensorflow/core/grappler/optimizers/auto_parallel.cc
index 42f2f1850f..d46b849ad4 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.cc
@@ -168,6 +168,11 @@ Status AutoParallel::Initialize(const GrapplerItem& item) {
   for (const auto& variable : item.MainVariables()) {
     dont_replicate_nodes.insert(variable->name());
   }
+
+  for (const auto& init : item.init_ops) {
+    dont_replicate_nodes.insert(NodeName(init));
+  }
+
   // Don't replicate all input nodes, except the dequeue node.
   for (const auto& input_node : input_nodes) {
     if (input_node->name() != dequeue_node->name()) {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index ad90bbe028..c5d2d47782 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
index 3d1b4a34bf..9a41b5e0b5 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
@@ -33,6 +33,7 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   Output constant_b = ops::Const(s.WithOpName("constant_b"), 1, {1});
   Output var = ops::Variable(s.WithOpName("var"), {1}, DT_FLOAT);
   Output assign = ops::Assign(s.WithOpName("assign"), {var}, {constant_a});
+  Output identity = ops::Identity(s.WithOpName("identity"), {var});
   Output fifo_queue = ops::FIFOQueue(s.WithOpName("fifo_queue"), {DT_FLOAT});
   auto dequeue = ops::QueueDequeueMany(s.WithOpName("dequeue"), {fifo_queue},
                                        {constant_b}, {DT_FLOAT});
@@ -44,13 +45,14 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   GrapplerItem item;
   item.init_ops.push_back("assign");
   item.fetch.push_back("apply_gradient");
+  item.init_ops.push_back("assign");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   AutoParallel parallel(2);
   GraphDef output;
   Status status = parallel.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  EXPECT_EQ(20, output.node_size());
+  EXPECT_EQ(21, output.node_size());
 
   const NodeDef& node_assign = output.node(0);
   EXPECT_EQ("assign", node_assign.name());
@@ -62,60 +64,64 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   const NodeDef& node_fifo_queue = output.node(2);
   EXPECT_EQ("fifo_queue", node_fifo_queue.name());
 
-  const NodeDef& node_var = output.node(3);
+  const NodeDef& node_identity = output.node(3);
+  EXPECT_EQ("identity", node_identity.name());
+  EXPECT_EQ("var", node_identity.input(0));
+
+  const NodeDef& node_var = output.node(4);
   EXPECT_EQ("var", node_var.name());
 
-  const NodeDef& node_div_const0 = output.node(4);
+  const NodeDef& node_div_const0 = output.node(5);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-Const",
             node_div_const0.name());
 
-  const NodeDef& node_div0 = output.node(5);
+  const NodeDef& node_div0 = output.node(6);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-apply_gradient",
             node_div0.name());
-  const NodeDef& node_add0 = output.node(6);
+  const NodeDef& node_add0 = output.node(7);
   EXPECT_EQ("AutoParallel-Replica-0/add", node_add0.name());
 
-  const NodeDef& node_gradient0 = output.node(7);
+  const NodeDef& node_gradient0 = output.node(8);
   EXPECT_EQ("AutoParallel-Replica-0/apply_gradient", node_gradient0.name());
 
-  const NodeDef& node_constant_a0 = output.node(8);
+  const NodeDef& node_constant_a0 = output.node(9);
   EXPECT_EQ("AutoParallel-Replica-0/constant_a", node_constant_a0.name());
 
-  const NodeDef& node_dequeue0 = output.node(9);
+  const NodeDef& node_dequeue0 = output.node(10);
   EXPECT_EQ("AutoParallel-Replica-0/dequeue", node_dequeue0.name());
 
-  const NodeDef& node_learning_rate0 = output.node(10);
+  const NodeDef& node_learning_rate0 = output.node(11);
   EXPECT_EQ("AutoParallel-Replica-0/learning_rate", node_learning_rate0.name());
 
-  const NodeDef& node_div_const1 = output.node(11);
+  const NodeDef& node_div_const1 = output.node(12);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-Const",
             node_div_const1.name());
 
-  const NodeDef& node_div1 = output.node(12);
+  const NodeDef& node_div1 = output.node(13);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-apply_gradient",
             node_div1.name());
 
-  const NodeDef& node_add1 = output.node(13);
+  const NodeDef& node_add1 = output.node(14);
   EXPECT_EQ("AutoParallel-Replica-1/add", node_add1.name());
 
-  const NodeDef& node_gradient1 = output.node(14);
+  const NodeDef& node_gradient1 = output.node(15);
   EXPECT_EQ("AutoParallel-Replica-1/apply_gradient", node_gradient1.name());
 
-  const NodeDef& node_constant_a1 = output.node(15);
+  const NodeDef& node_constant_a1 = output.node(16);
   EXPECT_EQ("AutoParallel-Replica-1/constant_a", node_constant_a1.name());
 
-  const NodeDef& node_dequeue1 = output.node(16);
+  const NodeDef& node_dequeue1 = output.node(17);
   EXPECT_EQ("AutoParallel-Replica-1/dequeue", node_dequeue1.name());
 
-  const NodeDef& node_learning_rate1 = output.node(17);
+  const NodeDef& node_learning_rate1 = output.node(18);
   EXPECT_EQ("AutoParallel-Replica-1/learning_rate", node_learning_rate1.name());
 
-  const NodeDef& node_fetch = output.node(18);
+  const NodeDef& node_fetch = output.node(19);
   EXPECT_EQ("AutoParallel-Control-Fetch", node_fetch.name());
   EXPECT_EQ("^AutoParallel-Replica-0/apply_gradient", node_fetch.input(0));
   EXPECT_EQ("^AutoParallel-Replica-1/apply_gradient", node_fetch.input(1));
 
-  const NodeDef& node_gradient = output.node(19);
+  const NodeDef& node_gradient = output.node(20);
   EXPECT_EQ("apply_gradient", node_gradient.name());
   EXPECT_EQ("^AutoParallel-Control-Fetch", node_gradient.input(0));
 }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index ded1e474ce..28d663e2f7 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -929,7 +929,7 @@ struct TuningConfig {
   // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
   // usually faster than the NCHW implementation. The downside is that this
   // might result in more non-cancellable layout conversion nodes (implemented
-  // by the Tranpose op).
+  // by the Transpose op).
   bool no_gemm;
 };
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 41995a2ff5..0b1c1085f2 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -725,6 +725,12 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "tile_ops",
+    srcs = ["tile_functor_cpu.cc"],
+    hdrs = ["tile_functor.h"],
+    gpu_srcs = [
+        "tile_functor.h",
+        "tile_functor_gpu.cu.cc",
+    ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
 )
@@ -2464,7 +2470,7 @@ tf_cc_tests(
         ":ops_util",
         ":sparse_add_op",
         ":sparse_dense_binary_op_shared",
-        ":sparse_reduce_sum_op",
+        ":sparse_reduce_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3207,7 +3213,7 @@ cc_library(
         ":sparse_cross_op",
         ":sparse_dense_binary_op_shared",
         ":sparse_fill_empty_rows_op",
-        ":sparse_reduce_sum_op",
+        ":sparse_reduce_op",
         ":sparse_reorder_op",
         ":sparse_reshape_op",
         ":sparse_softmax",
@@ -3263,8 +3269,8 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "sparse_reduce_sum_op",
-    prefix = "sparse_reduce_sum_op",
+    name = "sparse_reduce_op",
+    prefix = "sparse_reduce_op",
     deps = SPARSE_DEPS,
 )
 
@@ -4152,6 +4158,7 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "tensor_array.h",
+        "tile_functor.h",
         "tile_ops_cpu_impl.h",
         "tile_ops_impl.h",
         "training_op_helpers.h",
@@ -4290,6 +4297,7 @@ filegroup(
         "summary_op.cc",
         "tensor_array.cc",
         "tensor_array_ops.cc",
+        "tile_functor_cpu.cc",
         "tile_ops.cc",
         "tile_ops_cpu_impl_1.cc",
         "tile_ops_cpu_impl_2.cc",
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index c8f12f91a6..37976f7183 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -31,6 +31,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif
 
 // AdjustContrastOp is deprecated as of GraphDef version >= 2
 
@@ -410,4 +413,25 @@ REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
                         AdjustContrastOpv2<GPUDevice>);
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+template <>
+class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
+ public:
+  explicit AdjustContrastOpv2(OpKernelConstruction* context)
+      : AdjustContrastOpV2Base(context) {}
+
+  void DoCompute(OpKernelContext* context,
+                 const ComputeOptions& options) override {
+    const int64 shape[4] = {options.batch, options.height, options.width,
+                            options.channels};
+    functor::AdjustContrastv2<SYCLDevice>()(
+        context->eigen_device<SYCLDevice>(),
+        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
+        options.output->shaped<float, 4>(shape));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_SYCL),
+                        AdjustContrastOpv2<SYCLDevice>);
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
index ffd47406eb..c485f14844 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -56,6 +56,11 @@ static Graph* BM_AdjustContrast(int batches, int width, int height) {
 // BM_AdjustContrast_cpu_1_299_299     179084     340186  2181  751.9M items/s
 // BM_AdjustContrast_gpu_32_299_299     85276     123665  4189  2.9G items/s
 BM_AdjustContrastDev(cpu, 1, 299, 299);
+#if GOOGLE_CUDA
 BM_AdjustContrastDev(gpu, 32, 299, 299);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_AdjustContrastDev(sycl, 32, 299, 299);
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index d65a34fd73..ba100b32e7 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -35,6 +35,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif
 
 template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
@@ -146,4 +149,16 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
+                              .TypeConstraint<T>("T"),         \
+                          RGBToHSVOp<SYCLDevice, T>);          \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
+                              .TypeConstraint<T>("T"),         \
+                          HSVToRGBOp<SYCLDevice, T>);
+TF_CALL_float(REGISTER_SYCL);
+TF_CALL_double(REGISTER_SYCL);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 203a9a9f24..64c06786bc 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -112,15 +112,14 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                       \
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_SWITCH(type)                       \
   REGISTER_KERNEL_BUILDER(Name("Switch")                 \
                               .Device(DEVICE_SYCL)       \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("pred"),       \
+                              .HostMemory("pred")        \
+                              .TypeConstraint<type>("T"),\
                           SwitchOp)
-REGISTER_SYCL_KERNEL(bool);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
 
 #define REGISTER_SYCL_REF_SWITCH(type)                     \
   REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
@@ -128,12 +127,41 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
                               .HostMemory("pred")          \
                               .TypeConstraint<type>("T"),  \
                           SwitchOp)
-REGISTER_SYCL_REF_SWITCH(bool);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
+TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
-#undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_SWITCH
 #undef REGISTER_SYCL_REF_SWITCH
 
+#define REGISTER_SYCL_HOST_KERNEL(type)                  \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
+                              .Device(DEVICE_SYCL)       \
+                              .HostMemory("data")        \
+                              .HostMemory("pred")        \
+                              .HostMemory("output_false")\
+                              .HostMemory("output_true") \
+                              .TypeConstraint<type>("T"),\
+                          SwitchOp)
+
+REGISTER_SYCL_HOST_KERNEL(bool);
+REGISTER_SYCL_HOST_KERNEL(string);
+REGISTER_SYCL_HOST_KERNEL(int32);
+
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
+                              .Device(DEVICE_SYCL)         \
+                              .HostMemory("data")          \
+                              .HostMemory("pred")          \
+                              .HostMemory("output_false")  \
+                              .HostMemory("output_true")   \
+                              .TypeConstraint<type>("T"),  \
+                          SwitchOp)
+
+REGISTER_SYCL_HOST_REF_KERNEL(int32);
+REGISTER_SYCL_HOST_REF_KERNEL(bool);
+REGISTER_SYCL_HOST_REF_KERNEL(string);
+
+#undef REGISTER_SYCL_HOST_KERNEL
+#undef REGISTER_SYCL_HOST_REF_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
@@ -233,13 +261,13 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#if TENSORFLOW_USE_SYCL
+#ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                        \
   REGISTER_KERNEL_BUILDER(Name("Merge")                   \
                               .Device(DEVICE_SYCL)        \
                               .TypeConstraint<type>("T")  \
                               .HostMemory("value_index"), \
-                          MergeOp)
+                          MergeOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
@@ -248,9 +276,10 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
                               .Device(DEVICE_SYCL)        \
                               .TypeConstraint<type>("T")  \
                               .HostMemory("value_index"), \
-                          MergeOp)
+                          MergeOp);
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
+
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
 #endif // TENSORFLOW_USE_SYCL
@@ -280,6 +309,30 @@ REGISTER_GPU_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Merge")                   \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("inputs")       \
+                              .HostMemory("output")       \
+                              .HostMemory("value_index")  \
+                              .TypeConstraint<type>("T"), \
+                          MergeOp);                       \
+  REGISTER_KERNEL_BUILDER(Name("RefMerge")                \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("inputs")       \
+                              .HostMemory("output")       \
+                              .HostMemory("value_index")  \
+                              .TypeConstraint<type>("T"), \
+                          MergeOp)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(string);
+REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
+
+#undef REGISTER_SYCL_HOST_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
     context->forward_ref_input_to_ref_output(0, 0);
@@ -306,7 +359,7 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#if TENSORFLOW_USE_SYCL
+#ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)  \
   REGISTER_KERNEL_BUILDER(          \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
@@ -345,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif
+#endif // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -394,30 +447,25 @@ REGISTER_KERNEL_BUILDER(Name("RefExit").Device(DEVICE_CPU), ExitOp);
       Name("RefExit").Device(DEVICE_GPU).TypeConstraint<type>("T"), ExitOp);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
+REGISTER_GPU_REF_KERNEL(bool);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(          \
-  Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp)
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
-#define REGISTER_SYCL_REF_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(              \
-  Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp)
-REGISTER_SYCL_REF_KERNEL(bool);
-TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
-
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
 
-// Special GPU kernels for int32 and string.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 #define REGISTER_SYCL_HOST_KERNEL(type)                   \
   REGISTER_KERNEL_BUILDER(Name("Exit")                    \
                               .Device(DEVICE_SYCL)        \
@@ -507,31 +555,19 @@ REGISTER_GPU_HOST_KERNEL(string);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(Name("NextIteration")           \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("data")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          NextIterationOp)
-  REGISTER_SYCL_KERNEL(bool);
-  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
-#define REGISTER_SYCL_REF_KERNEL(type)                    \
-  REGISTER_KERNEL_BUILDER(Name("RefNextIteration")        \
-                              .Device(DEVICE_SYCL)        \
-                              .HostMemory("data")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          NextIterationOp)
-  REGISTER_SYCL_REF_KERNEL(bool);
-  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
+      NextIterationOp);                                                      \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+      NextIterationOp)
+REGISTER_SYCL_KERNEL(bool);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+
 #undef REGISTER_SYCL_KERNEL
-#undef REGISTER_SYCL_REF_KERNEL
 
-// Special GPU kernels for int32 and string.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 #define REGISTER_SYCL_HOST_KERNEL(type)                   \
   REGISTER_KERNEL_BUILDER(Name("NextIteration")           \
                               .Device(DEVICE_SYCL)        \
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index 5d3385b0ed..5dea00e95c 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,10 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER5(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
+          uint8, complex128, string);
 #if GOOGLE_CUDA
-REGISTER3(BinaryOp, GPU, "Add", functor::add, int64, complex64, complex128);
+REGISTER4(BinaryOp, GPU, "Add", functor::add, uint8, int64, complex64,
+          complex128);
 #endif  // GOOGLE_CUDA
 
 #endif  // !defined(__ANDROID_TYPES_SLIM__)
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
new file mode 100644
index 0000000000..bca99a4f89
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -0,0 +1,37 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
+          complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Cosh")                                \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("T"),                 \
+                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
index 5aaf2b5b4b..61079ebab3 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(add, Eigen::half, float, double, int64, complex64, complex128);
+DEFINE_BINARY7(add, Eigen::half, float, double, uint8, int64, complex64,
+               complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
new file mode 100644
index 0000000000..267a381d1a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc
@@ -0,0 +1,26 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(cosh, float, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
new file mode 100644
index 0000000000..f8329e50d6
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc
@@ -0,0 +1,26 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sinh, float, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index c84ee6894e..df2c02e42e 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -20,7 +20,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
           uint8, uint16);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
+REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
          uint8, uint16);
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
new file mode 100644
index 0000000000..055f0b12e1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -0,0 +1,37 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
+          complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Sinh")                                \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("T"),                 \
+                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYC
+
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index edd6c07146..78fefc69c7 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -35,4 +35,17 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("T"),
     BinaryOp<CPUDevice, functor::squared_difference<int32>>);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "SquaredDifference", functor::squared_difference,
+          float, double, int64);
+REGISTER_KERNEL_BUILDER(
+    Name("SquaredDifference")
+        .Device(DEVICE_SYCL)
+        .HostMemory("x")
+        .HostMemory("y")
+        .HostMemory("z")
+        .TypeConstraint<int32>("T"),
+    BinaryOp<CPUDevice, functor::squared_difference<int32>>);
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 97bdc5e878..c11d6cfabb 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -483,6 +483,12 @@ struct log1p : base<T, Eigen::internal::scalar_log1p_op<T> > {};
 template <typename T>
 struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {};
 
+template <typename T>
+struct sinh : base<T, Eigen::internal::scalar_sinh_op<T> > {};
+
+template <typename T>
+struct cosh : base<T, Eigen::internal::scalar_cosh_op<T> > {};
+
 template <typename T>
 struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {};
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index 08ae787c86..135d635514 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -165,20 +165,6 @@ class DynamicStitchOp : public OpKernel {
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_DYNAMIC_STITCH_SYCL(type)               \
-  REGISTER_KERNEL_BUILDER(Name("DynamicStitch")          \
-                              .Device(DEVICE_SYCL)       \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("indices")     \
-                              .HostMemory("data")        \
-                              .HostMemory("merged"),     \
-                          DynamicStitchOp<type>)
-
-TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_SYCL);
-#undef REGISTER_DYNAMIC_STITCH_SYCL
-#endif  // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 #define REGISTER_DYNAMIC_STITCH_GPU(type)                \
   REGISTER_KERNEL_BUILDER(Name("DynamicStitch")          \
@@ -194,4 +180,17 @@ TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_GPU);
 
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_DYNAMIC_STITCH_SYCL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("DynamicStitch")          \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("indices")     \
+                              .HostMemory("data")        \
+                              .HostMemory("merged"),     \
+                          DynamicStitchOp<type>)
+
+TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_SYCL);
+#undef REGISTER_DYNAMIC_STITCH_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 6431c6540e..46eaf3d9e7 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -547,14 +547,14 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
                             .HostMemory("indices")
                             .Device(DEVICE_GPU),
                         MapStageOp<true>);
-#endif
+#endif // GOOGLE_CUDA
+
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapStage").HostMemory("key").Device(DEVICE_SYCL),
                         MapStageOp<false>);
 REGISTER_KERNEL_BUILDER(
     Name("OrderedMapStage").HostMemory("key").Device(DEVICE_SYCL),
     MapStageOp<true>);
-
 #endif // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
@@ -661,6 +661,7 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek")
                             .Device(DEVICE_GPU),
                         MapPeekOp<true>);
 #endif
+
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("MapPeek").HostMemory("key").HostMemory("indices").Device(DEVICE_SYCL),
@@ -724,8 +725,8 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey")
                             .HostMemory("indices")
                             .Device(DEVICE_GPU),
                         MapUnstageNoKeyOp<true>);
-
 #endif
+
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapUnstageNoKey")
                             .HostMemory("key")
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 0e899402a2..b29feb0032 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,8 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-    gemmlowp::meta::Gemm<gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>,
-                         Params, 1, 8, 8>(params);
+      gemmlowp::meta::MultiThreadGemm<
+          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
+          1, 8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index dc6b88e953..ddcf241277 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -206,10 +206,15 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     // Mkl needs the entities in its native format.
     // So create temporary tensors along with buffers to
     // convert the received entities.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor,
+           mkl_tmp_buf_trans_input;
     // This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
-    mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
-                                 &mkl_tmp_out_backprop_buf_tensor);
+    mkl_context.MklPrepareInputs(context, data_format_,
+                                 input_in_mkl_format,
+                                 out_backprop_in_mkl_format,
+                                 &mkl_tmp_input_buf_tensor,
+                                 &mkl_tmp_out_backprop_buf_tensor,
+                                 &mkl_tmp_buf_trans_input);
 
     // Final conv-grad-filter should be in TF layout.
     Tensor* grad_filter;
@@ -307,34 +312,58 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
 
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
-    void MklPrepareInputs(OpKernelContext* context,
+    void MklPrepareInputs(OpKernelContext* context, TensorFormat format,
+                          bool input_in_mkl_format,
+                          bool out_backprop_in_mkl_format,
                           Tensor* mkl_tmp_input_buf_tensor,
-                          Tensor* mkl_tmp_out_backprop_buf_tensor) {
+                          Tensor* mkl_tmp_out_backprop_buf_tensor,
+                          Tensor* mkl_tmp_buf_trans_input) {
       bool mkl_convert_input, mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
+      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop,
+                  mkl_lt_trans_input;
       void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
+      void *mkl_buf_input, *mkl_buf_out_backprop;
 
       mkl_prim_convert_input = nullptr;
       mkl_prim_convert_out_backprop = nullptr;
       mkl_lt_internal_input = nullptr;
       mkl_lt_internal_out_backprop = nullptr;
+      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_out_backprop = nullptr;
+      mkl_buf_input = nullptr;
+      mkl_buf_out_backprop = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+      if (!input_in_mkl_format && format == FORMAT_NHWC){
+        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
+            in_sizes[MklDims::N], in_sizes[MklDims::H],
+            in_sizes[MklDims::W], in_sizes[MklDims::C]);
+        OP_REQUIRES_OK(context, context->allocate_temp(
+            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
+        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
+        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
+            mkl_tmp_buf_trans_input->flat<float>().data()));
+        size_t strides[4];
+        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
+        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
+            strides), E_SUCCESS);
+      }
+      else {
+        mkl_buf_input =
+            const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+        mkl_lt_trans_input = lt_input;
+      }
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
                    &mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
+                     mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -343,26 +372,30 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
+      if (!input_in_mkl_format && format == FORMAT_NHWC)
+        dnnLayoutDelete_F32(mkl_lt_trans_input);
+
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(
-          static_cast<const void*>(out_backprop.flat<T>().data()));
+      mkl_buf_out_backprop = const_cast<void*>(
+                      static_cast<const void*>(out_backprop.flat<T>().data()));
+
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
                                                 dnnResourceDiffDst),
                E_SUCCESS);
       mkl_convert_out_backprop =
-          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
+          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop,
+                                lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                                         lt_out_backprop,
-                                         mkl_lt_internal_out_backprop),
+                      lt_out_backprop, mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-                       lt_out_backprop, &mkl_buf_convert_out_backprop);
+            mkl_lt_internal_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 76b9f1798d..df49e03f31 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -267,12 +267,15 @@ class MklConv2DOp : public OpKernel {
 
     mkl_context.MklCreateInputLayouts(context);
 
+    // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
-        mkl_tmp_bias_buf_tensor;  // Temp tensor used to allocate tmp
-                                  // buffers
-    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
+        mkl_tmp_bias_buf_tensor, mkl_tmp_buf_trans_input;
+    mkl_context.MklPrepareConvolutionInputs(context, data_format_,
+                                            input_in_mkl_format,
+                                            &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
-                                            &mkl_tmp_bias_buf_tensor);
+                                            &mkl_tmp_bias_buf_tensor,
+                                            &mkl_tmp_buf_trans_input);
 
     // Execute convolution
     CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res),
@@ -323,39 +326,59 @@ class MklConv2DOp : public OpKernel {
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
     void MklPrepareConvolutionInputs(OpKernelContext* context,
+                                     TensorFormat format,
+                                     bool input_in_mkl_format,
                                      Tensor* mkl_tmp_input_buf_tensor,
                                      Tensor* mkl_tmp_filter_buf_tensor,
-                                     Tensor* mkl_tmp_bias_buf_tensor) {
+                                     Tensor* mkl_tmp_bias_buf_tensor,
+                                     Tensor* mkl_tmp_buf_trans_input) {
       bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
       dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
           mkl_prim_convert_input;
       dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
-          mkl_lt_internal_input;
+          mkl_lt_internal_input, mkl_lt_trans_input;
       void *mkl_buf_convert_input, *mkl_buf_convert_filter,
-          *mkl_buf_convert_bias;
+          *mkl_buf_convert_bias, *mkl_buf_input;
       mkl_prim_convert_filter = nullptr;
       mkl_prim_convert_bias = nullptr;
       mkl_prim_convert_input = nullptr;
       mkl_lt_internal_filter = nullptr;
       mkl_lt_internal_bias = nullptr;
       mkl_lt_internal_input = nullptr;
+      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_filter = nullptr;
       mkl_buf_convert_bias = nullptr;
+      mkl_buf_input = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+      if (!input_in_mkl_format && format == FORMAT_NHWC) {
+        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
+            in_sizes[MklDims::N], in_sizes[MklDims::H],
+            in_sizes[MklDims::W], in_sizes[MklDims::C]);
+        OP_REQUIRES_OK(context, context->allocate_temp(
+            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
+        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
+        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
+            mkl_tmp_buf_trans_input->flat<float>().data()));
+        size_t strides[4];
+        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
+        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
+            strides), E_SUCCESS);
+      } else {
+          mkl_buf_input = const_cast<void*>(
+                              static_cast<const void*>(input.flat<T>().data()));
+          mkl_lt_trans_input = lt_input;
+      }
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
                                                 prim_fwd, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
+                 mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -364,6 +387,8 @@ class MklConv2DOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
+      if (!input_in_mkl_format && format == FORMAT_NHWC)
+        dnnLayoutDelete_F32(mkl_lt_trans_input);
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 070aeff49f..07a7e6b5da 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,9 +22,6 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -33,6 +30,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
@@ -66,11 +66,10 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
+    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                        " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -93,10 +92,9 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
-        errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -104,15 +102,6 @@ class MklLRNOp : public OpKernel {
       return;
     }
 
-    // TODO(inteltf) MKL will support depth radius not equal to 2 in the future
-    if (depth_radius_ != 2) {
-      Tensor converted_tensor =
-          ConvertMklToTF<T>(context, input, mkl_context.input_shape);
-      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                    beta_, converted_tensor);
-      return;
-    }
-
     if (input_in_mkl_format) {
       // MKL supports normalization over channel dimension only
       if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
@@ -345,11 +334,10 @@ class MklLRNGradOp : public OpKernel {
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
+    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                        " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -553,6 +541,9 @@ class MklLRNGradOp : public OpKernel {
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
                                                 dnnResourceDiffDst),
                E_SUCCESS);
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_bwd,
+                                                dnnResourceSrc),
+               E_SUCCESS);
 
       bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
       if (ingrad_in_mkl_format) {
@@ -581,44 +572,37 @@ class MklLRNGradOp : public OpKernel {
         }
       }
 
-// Although MKL documentation for LRN does not specify setting/getting
-// of dnnResourceSrc and dnnResourceDst, Caffe code sets dnnResourceSrc.
-// So we set dnnResourceSrc here. But we do not know why we are setting
-// dnnResourceDst.
-#if 0
-    // NOTE: The code below is kept just so that we know how we should handle
-    // dnnResourceSrc if the primitive layout for dnnResourceSrc was supported.
-
-    if (!dnnLayoutCompare_F32(lt_internal_input,
-         static_cast<dnnLayout_t>inimage_shape.GetCurLayout())) {
-      AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                     &res_lrn_bwd[dnnResourceSrc]);
-      inimage_shape.GetConvertedFlatData(lt_internal_input,
-                                           user_fwd_input,
-                                           res_lrn_bwd[dnnResourceSrc]);
-    } else {
-      res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-    }
-#endif
-
-      // Since we cannot get expected layout for dnnResourceSrc, we construct
-      // buffer using
-      // MKL format if input is in MKL format.
-      if (inimage_shape.IsMklTensor()) {
-        AllocTmpBuffer(context, mkl_tmp_image_buf_tensor,
-                       (dnnLayout_t)inimage_shape.GetCurLayout(),
-                       &res_lrn_bwd[dnnResourceSrc]);
+      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+      if (inimage_in_mkl_format) {
+        if (!dnnLayoutCompare_F32(
+                lt_internal_input,
+                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
+          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+                         &res_lrn_bwd[dnnResourceSrc]);
+          ingrad_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
+                                            res_lrn_bwd[dnnResourceSrc]);
+        } else {
+          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+        }
       } else {
-        res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-      }
+        if (!dnnLayoutCompare_F32(
+                lt_internal_input,
+                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
+          CHECK_EQ(dnnConversionCreate_F32(
+                       &convert_input,
+                       static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()),
+                       lt_internal_input),
+                   E_SUCCESS);
 
-      // Same comment as above.
-      if (outimage_shape.IsMklTensor()) {
-        AllocTmpBuffer(context, mkl_tmp_outimage_buf_tensor,
-                       (dnnLayout_t)outimage_shape.GetCurLayout(),
-                       &res_lrn_bwd[dnnResourceDst]);
-      } else {
-        res_lrn_bwd[dnnResourceDst] = user_fwd_output;
+          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+                         &res_lrn_bwd[dnnResourceSrc]);
+          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_fwd_input,
+                                            res_lrn_bwd[dnnResourceSrc]),
+                   E_SUCCESS);
+          dnnDelete_F32(convert_input);
+        } else {
+          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+        }
       }
 
       res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
@@ -628,8 +612,6 @@ class MklLRNGradOp : public OpKernel {
     // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
     // copy.
     void MklDefaultToEigen(OpKernelContext* context) {
-      // CHECK(false);
-
       Tensor in_grads;
       Tensor in_image;
       Tensor out_image;
@@ -709,7 +691,7 @@ class MklLRNGradOp : public OpKernel {
       Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
             depth * depth, shard);
     }
-		
+
     // release mkl resources
     void Mklcleanup() {
       bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 10d2937584..fabecc39a8 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -184,38 +184,31 @@ class MklReluGradOp : public OpKernel {
     dnnLayout_t lt_input, lt_grad;
 
     void MklPrepareReluGradInputs(OpKernelContext* context,
-                                  Tensor* mkl_tmp_grad_buf_tensor,
                                   Tensor* mkl_tmp_input_buf_tensor) {
-      dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
-
       const Tensor& g = MklGetInput(context, 0);
       const Tensor& a = MklGetInput(context, 1);
-
-      void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-      void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-      dnnPrimitive_t cv_input_to_grad = NULL;
-      Tensor mkl_tmp_buf_tensor;
+      void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
       void* mkl_buffer_convert = nullptr;
+      dnnPrimitive_t cv_input_to_grad = nullptr;
 
       // if input and grad are not in the same layout, do a conversion between
       // them.
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
-        AllocTmpBuffer(context, &mkl_tmp_buf_tensor, lt_grad,
+        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
-                 E_SUCCESS);
-
-        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, user_i,
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
+                   lt_grad), E_SUCCESS);
+        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
                                           mkl_buffer_convert),
                  E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
-        relu_res[dnnResourceSrc] = user_i;
+        relu_res[dnnResourceSrc] = buf_input;
       }
 
-      relu_res[dnnResourceDiffDst] = user_g;
+      void* buf_grad = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+      relu_res[dnnResourceDiffDst] = buf_grad;
     }
 
     void MklCreateInputLayouts(OpKernelContext* context) {
@@ -317,9 +310,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                                      mkl_context.lt_grad, mkl_context.lt_grad,
                                      negative_slope),
            E_SUCCESS);
-  Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
-                                       &mkl_tmp_input_buf_tensor);
+  Tensor mkl_tmp_input_buf_tensor;
+  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
   if (input_is_mkl ||
       grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index 588d6874dd..b4aae67ca6 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -24,12 +24,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
+#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -44,10 +45,11 @@ class MklToTfOp : public OpKernel {
   explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
     OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
   }
 
   void Compute(OpKernelContext* context) override {
-    // 1. Check that input tensor is in MKL format.
+    // Check that input tensor is in MKL format.
     const Tensor& input_tensor = MklGetInput(context, 0);
     MklShape input_shape;
     GetMklShape(context, 0, &input_shape);
@@ -68,9 +70,12 @@ class MklToTfOp : public OpKernel {
     CHECK_EQ(op_data_type, output_data_type);
 
     TensorShape output_shape;
-    for (size_t i = 0; i < input_shape.GetDimension(); i++) {
+    size_t ndims = input_shape.GetDimension();
+    size_t* in_sizes = new size_t[ndims];
+    for (size_t i = 0; i < ndims; i++) {
       // Outermost to innermost dimension
       output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
+      in_sizes[i] = input_shape.GetSizes()[i];
     }
 
     // Allocate output tensor.
@@ -78,17 +83,41 @@ class MklToTfOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_shape, &output_tensor));
 
-    // 3. Get input and output layout pointers.
-    dnnLayout_t output_layout =
-        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
+    // If data format is NHWC, transform MKL tensor to NCHW format and then
+    // do NCHW -> NHWC.
+    dnnLayout_t lt_trans_input = nullptr;
+    Tensor mkl_tmp_trans_input_buf_tensor;
+    void* buf_trans_input = nullptr;
+    bool input_fmt_nhwc = input_shape.IsTensorInNHWCFormat();
+    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
+      size_t strides_nchw[4];
+      GetStridesFromSizes(FORMAT_NCHW, strides_nchw, in_sizes);
+      CHECK_EQ(
+          dnnLayoutCreate_F32(&lt_trans_input, ndims, in_sizes, strides_nchw),
+          E_SUCCESS);
+      AllocTmpBuffer(context, &mkl_tmp_trans_input_buf_tensor, lt_trans_input,
+                     &buf_trans_input);
+    } else {
+      lt_trans_input = static_cast<dnnLayout_t>(input_shape.GetTfLayout());
+      buf_trans_input =
+          static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
+    }
 
-    // 4. Execute DNNConversion.
+    // Execute DNNConversion.
     void* input_buffer =
         static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    void* output_buffer =
-        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    input_shape.GetConvertedFlatData(output_layout, input_buffer,
-                                     output_buffer);
+    input_shape.GetConvertedFlatData(lt_trans_input, input_buffer,
+                                     buf_trans_input);
+    // NCHW -> NHWC, if data format is NHWC
+    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
+      dnnLayoutDelete_F32(lt_trans_input);
+      TensorShape nhwc_shape = ShapeFromFormat(
+          FORMAT_NHWC, in_sizes[MklDims::N], in_sizes[MklDims::H],
+          in_sizes[MklDims::W], in_sizes[MklDims::C]);
+      MklNCHWToNHWC(mkl_tmp_trans_input_buf_tensor, &output_tensor);
+    }
+
+    delete[] in_sizes;
 
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
@@ -99,6 +128,9 @@ class MklToTfOp : public OpKernel {
 
   /// Data type of the operation
   DataType op_data_type;
+
+  /// CPUIDInfo
+  bool has_avx512f_ = false;
 };
 
 ///////////////////////////////////////////////////////////
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 9ffe71e031..dc95f67ff0 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -90,20 +90,24 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   return intersection_area / (area_i + area_j - intersection_area);
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
-                           const Tensor& scores, const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context,
+                           const Tensor& boxes,
+                           const Tensor& scores,
+                           const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
-
+      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+  
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
   if (!context->status().ok()) {
     return;
   }
 
-  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
+  const int output_size =
+      std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data =
+      boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -123,7 +127,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
     for (int j = i + 1; j < num_boxes; ++j) {
       if (active[j]) {
         float iou =
-            ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
+          ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
         if (iou > iou_threshold) {
           active[j] = false;
           num_active--;
@@ -141,7 +145,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
   std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
 }
 
-}  // namespace
+} // namespace
 
 template <typename Device>
 class NonMaxSuppressionOp : public OpKernel {
@@ -163,8 +167,7 @@ class NonMaxSuppressionOp : public OpKernel {
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                 max_output_size.shape().DebugString()));
 
-    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
-                          iou_threshold_);
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_);
   }
 
  private:
@@ -175,7 +178,8 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+  }
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -190,14 +194,14 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                        iou_threshold.shape().DebugString()));
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
-    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
-                          iou_threshold_val);
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_val);
   }
 };
 
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 894ad3c9a0..4c406fc1ed 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -339,7 +339,7 @@ void PriorityQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
             for (; s > 0; --s) {
               if (attempt->tuple.empty()) {
                 // Only allocate tuple when we have something to dequeue
-                // so we don't use exceessive memory when there are many
+                // so we don't use excessive memory when there are many
                 // blocked dequeue attempts waiting.
                 attempt->tuple.reserve(num_components());
                 for (int i = 0; i < num_components(); ++i) {
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index d78c6d2639..c5e3164145 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -48,6 +48,7 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                           ShapeOp<int64>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("Shape")
@@ -102,7 +103,7 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                             .TypeConstraint<int32>("T")
                             .TypeConstraint<int64>("out_type"),
                         ShapeOp<int64>);
-#endif
+#endif  // GOOGLE_CUDA
 
 // ShapeN ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ShapeN")
@@ -152,9 +153,9 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                             .TypeConstraint<int32>("T")
                             .TypeConstraint<int64>("out_type"),
                         ShapeNOp<int64>);
-#endif
+#endif  // GOOGLE_CUDA
 
-#if TENSORFLOW_USE_SYCL
+#ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
   REGISTER_KERNEL_BUILDER(Name("ShapeN")                         \
                               .Device(DEVICE_SYCL)               \
@@ -170,11 +171,9 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                           ShapeNOp<int64>)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("ShapeN")
                             .Device(DEVICE_SYCL)
                             .HostMemory("input")
@@ -202,13 +201,9 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
                               .TypeConstraint<type>("T") \
                               .HostMemory("output"),     \
                           RankOp);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
-// A special GPU kernel for int32 and bool.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Rank")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -250,7 +245,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank")
                             .HostMemory("input")
                             .HostMemory("output"),
                         RankOp);
-#endif
+#endif  // GOOGLE_CUDA
 
 // Size ------------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Size")
@@ -299,7 +294,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SizeOp<int64>);
-#endif
+#endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                               \
@@ -315,13 +310,10 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                               .TypeConstraint<int64>("out_type") \
                               .HostMemory("output"),             \
                           SizeOp<int64>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Size")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -336,7 +328,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SizeOp<int64>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // ExpandDims ------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -365,7 +357,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                           \
@@ -375,9 +367,8 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .TypeConstraint<int32>("Tdim") \
                               .HostMemory("dim"),            \
                           ExpandDimsOp);
-REGISTER_SYCL_KERNEL(float)
-REGISTER_SYCL_KERNEL(double)
-
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -388,7 +379,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Squeeze ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
@@ -411,26 +402,23 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SqueezeOp);
-#endif
+#endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                  \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNEL(type)                                   \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       SqueezeOp);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
 
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Squeeze")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
                             .HostMemory("input")
                             .HostMemory("output"),
                         SqueezeOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index ee6f9a28cd..d46701749b 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -118,6 +118,43 @@ static void SharedValidation(OpKernelContext* context,
   }
 }
 
+// Extracted out code in SliceOp::Compute so that MklSliceOp can reuse this
+// generic code
+template <typename T>
+static void SharedSliceCommonCases(OpKernelContext* context,
+                                   TensorShape* output_shape,
+                                   gtl::InlinedVector<int64, 4>* begin,
+                                   gtl::InlinedVector<int64, 4>* size,
+                                   Tensor** result,
+                                   bool* done) {
+  bool is_identity = true;
+  bool slice_dim0 = true;
+  *done = false;
+
+  SharedValidation(context, output_shape, &is_identity, &slice_dim0, begin,
+                   size);
+  if (!context->status().ok()) return;
+  const Tensor& input = context->input(0);
+  if (is_identity) {
+    VLOG(1) << "Slice identity";
+    context->set_output(0, input);
+    *done = true;
+    return;
+  }
+
+  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
+                                          (*size)[0])) {
+    VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
+    CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
+    context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
+    *done = true;
+    return;
+  }
+
+  OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
+}
+
+
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -125,29 +162,89 @@ class SliceOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     TensorShape output_shape;
-    bool is_identity = true;
-    bool slice_dim0 = true;
     gtl::InlinedVector<int64, 4> begin;
     gtl::InlinedVector<int64, 4> size;
-    SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin,
-                     &size);
-    if (!context->status().ok()) return;
+    Tensor* result = nullptr;
+    bool done = false;
+    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
+                              &done);
+    if (!context->status().ok() || done == true) return;
+
     const Tensor& input = context->input(0);
-    if (is_identity) {
-      VLOG(1) << "Slice identity";
-      context->set_output(0, input);
-      return;
+    const int input_dims = input.dims();
+
+    if (output_shape.num_elements() > 0) {
+      if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
+          DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        auto input = context->input(0).tensor<T, 2>();
+        auto output = result->tensor<T, 2>();
+        // TODO(agarwal): Consider multi-threading this loop for cases where
+        // size[0] is very large.
+        for (int i = 0; i < size[0]; ++i) {
+          const int64 row = begin[0] + i;
+          if (i + 1 < size[0]) {
+            port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
+            port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
+          }
+          memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
+        }
+        return;
+      }
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
+  }
+
+      HANDLE_DIM(1);
+      HANDLE_DIM(2);
+      HANDLE_DIM(3);
+      HANDLE_DIM(4);
+      HANDLE_DIM(5);
+      HANDLE_DIM(6);
+      HANDLE_DIM(7);
+
+#undef HANDLE_DIM
+
+      OP_REQUIRES(context, false, errors::Unimplemented(
+                                      "SliceOp : Unhandled input dimensions"));
     }
+  }
 
-    if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), begin[0], size[0])) {
-      VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
-      CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
-      context->set_output(0, input.Slice(begin[0], begin[0] + size[0]));
-      return;
+ private:
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+                  const gtl::ArraySlice<int64>& size, Tensor* result) {
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
     }
 
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
+  }
+};
+
+#ifdef INTEL_MKL
+template <typename Device, typename T>
+class MklSliceOp : public OpKernel {
+ public:
+  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorShape output_shape;
+    gtl::InlinedVector<int64, 4> begin;
+    gtl::InlinedVector<int64, 4> size;
     Tensor* result = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+    bool done = false;
+    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
+                              &done);
+    if (!context->status().ok() || done == true) return;
+
+    const Tensor& input = context->input(0);
     const int input_dims = input.dims();
 
     if (output_shape.num_elements() > 0) {
@@ -189,9 +286,123 @@ class SliceOp : public OpKernel {
   }
 
  private:
+  // Helper function for DoesSliceShapeDifferInOnly1D. Checks if the following
+  // criteria matches for slice_dim: if indices for slice are 0 in all dims
+  // except slice_dim and if sizes of all the dimensions of the slice are same
+  // as the sizes of all the dimensions of the input except slice_dim, then
+  // returns True. Otherwise, returns False.
+  bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
+                          const gtl::ArraySlice<int64>& begin,
+                          const gtl::ArraySlice<int64>& size,
+                          int slice_dim) {
+    for (int dim = 0; dim < 4; dim++) {
+      if (dim != slice_dim &&
+          (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Is 'input' tensor being sliced over a single dimension out of 4?
+  //
+  // This check is applicable in the context of Slice of a 4-D tensor in
+  // NHWC or NCHW format over channel dimension.
+  //
+  // If indices for slice are 0 in all dims except one dimension and if sizes of
+  // all dimensions of slice are same as sizes of all dimensions of inputs
+  // except that dimension, then we are slicing over a single dimension.
+  //
+  // Returns True if Slicing over a single dimension, and sets slice_dim
+  // to the number of the dimension that satisfies criteria.
+  bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
+                          const gtl::ArraySlice<int64>& begin,
+                          const gtl::ArraySlice<int64>& size,
+                          int* slice_dim) {
+    for (int dim = 0; dim < 4; dim++) {
+      if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
+        *slice_dim = dim;
+        return true;
+      }
+    }
+    return false;
+  }
+
   template <int NDIM>
-  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context,
+                  const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
+    int slice_dim = -1;
+    TensorShape in_shape = context->input(0).shape();
+    // Special case for handling 4-D tensor slice when shape of the slice
+    // differs from the input tensor in only 1 out of 4 dimensions.
+    // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
+    // format over channel dimension.
+    if (NDIM == 4 &&
+        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
+        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
+                                          in_shape.dim_size(2) *
+                                          in_shape.dim_size(3),
+                                 (size_t) in_shape.dim_size(2) *
+                                          in_shape.dim_size(3),
+                                 (size_t) in_shape.dim_size(3),
+                                 (size_t) 1
+                               };
+
+        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
+                                  (size_t) size[2] * size[3],
+                                  (size_t) size[3],
+                                  (size_t) 1 };
+
+        T *in_buf = const_cast<T*>(const_cast<const T*>(
+                    context->input(0).flat<T>().data()));
+        T *op_buf = result->flat<T>().data();
+
+        if (slice_dim == 1) {
+          /* data format = NCHW */
+
+          #pragma omp parallel for
+          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+              T *ip  = in_buf + (d0 * in_strides[0]);
+              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
+            #pragma omp parallel for
+            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+              T *ip1 = ip + (d1 * in_strides[1]);
+              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
+              // For NCHW, H and W will be contiguous. So we can copy
+              // both with one memcpy.
+              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                     sizeof(T) * in_strides[1]);
+            }
+          }
+          return;
+        } else if (slice_dim == 3) {
+          /* data_format = NHWC */
+
+          #pragma omp parallel for
+          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+              T *ip = in_buf + (d0 * in_strides[0]);
+              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+            #pragma omp parallel for
+            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+              T *ip1 = ip + (d1 * in_strides[1]);
+              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
+              #pragma omp parallel for
+              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+                T *ip2 = ip1 + (d2 * in_strides[2]);
+                T *ip3 = ip2 + begin[3];
+                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+                T *op3 = op2;
+                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                       sizeof(T) * size[3]);
+              }
+            }
+          }
+          return;
+        }
+        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+    }
+
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
     for (int i = 0; i < NDIM; ++i) {
@@ -204,6 +415,7 @@ class SliceOp : public OpKernel {
         context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
+#endif
 
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
@@ -233,6 +445,7 @@ DECLARE_FOR_N(bfloat16);
 #undef DECLARE_CPU_SPEC
 }  // namespace functor
 
+#ifndef INTEL_MKL
 #define REGISTER_SLICE(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Slice")                  \
                               .Device(DEVICE_CPU)        \
@@ -244,8 +457,21 @@ DECLARE_FOR_N(bfloat16);
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
 REGISTER_SLICE(bfloat16);
+#undef REGISTER_SLICE
+#else
+#define REGISTER_SLICE(type)                             \
+  REGISTER_KERNEL_BUILDER(Name("Slice")                  \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("begin")       \
+                              .HostMemory("size"),       \
+                          MklSliceOp<CPUDevice, type>)
 
+TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
+TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
+REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
+#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
diff --git a/tensorflow/core/kernels/sparse_reduce_op.cc b/tensorflow/core/kernels/sparse_reduce_op.cc
new file mode 100644
index 0000000000..9e60791f97
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_reduce_op.cc
@@ -0,0 +1,341 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/sparse_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+// TODO(b/31496047): Fix non-standard include order.
+#include <numeric>  // clang-format off
+
+using tensorflow::sparse::SparseTensor;
+using tensorflow::gtl::ArraySlice;
+
+namespace tensorflow {
+
+struct ReduceDetails {
+  // The dimensions to call Reorder() with.
+  std::vector<int64> reorder_dims;
+
+  // The dimensions to call group() with after Reorder().
+  std::vector<int64> group_by_dims;
+
+  // The shape after reduction.
+  TensorShape reduced_shape;
+};
+
+// Compute common reduce parameters that'll be used for SparseTensor
+// reductions. Usage:
+// ReduceDetails reduction = SparseTensorReduceHelper(sp, axes, keep_dims);
+// sp.Reorder(reduction.reorder_dims);
+// for (const auto& g : sp.group(reduction.group_by_dims)) {
+//   ...
+// }
+// // Set output shape to reduction.reduced_shape.
+ReduceDetails SparseTensorReduceHelper(const SparseTensor &sp,
+                                       gtl::ArraySlice<int32> axes_slice,
+                                       bool keep_dims) {
+  ReduceDetails reduction;
+
+  std::vector<int32> reduction_axes(axes_slice.begin(), axes_slice.end());
+  int ndims = sp.dims();
+  for (int64 i = 0; i < reduction_axes.size(); ++i) {
+    reduction_axes[i] = (reduction_axes[i] + ndims) % ndims;
+  }
+  std::sort(reduction_axes.begin(), reduction_axes.end());
+
+  // (0) Calculate the grouping dimensions:
+  // group_by_dims == {0, .., NDIMS-1} \ reduction_axes.
+  std::vector<int64> perm(ndims);
+  std::iota(perm.begin(), perm.end(), 0);
+
+  // Requires perm and reduction_axes_ be sorted; group_by_dims will be
+  // sorted as well.
+  std::set_difference(
+      perm.begin(), perm.end(), reduction_axes.begin(), reduction_axes.end(),
+      std::inserter(reduction.group_by_dims, reduction.group_by_dims.begin()));
+
+  // Now append the rest of the axes (the complement of group_by_dims_);
+  // result is used by Reorder().
+  reduction.reorder_dims = reduction.group_by_dims;
+  std::set_difference(perm.begin(), perm.end(), reduction.group_by_dims.begin(),
+                      reduction.group_by_dims.end(),
+                      std::back_inserter(reduction.reorder_dims));
+
+  // (1) Calculate the shape after reduction.
+  auto sp_shape = sp.shape();
+  std::vector<int64> out_dim_sizes;
+  if (keep_dims) {
+    out_dim_sizes.reserve(ndims);
+    auto beg = reduction.group_by_dims.begin();
+    auto end = reduction.group_by_dims.end();
+    for (int d = 0; d < ndims; ++d) {
+      if (std::find(beg, end, d) == end) {
+        out_dim_sizes.push_back(1);  // A reduced axis.
+      } else {
+        out_dim_sizes.push_back(sp_shape[d]);
+      }
+    }
+  } else {
+    out_dim_sizes = sp.PickDims(reduction.group_by_dims);
+  }
+
+  reduction.reduced_shape = TensorShape(out_dim_sizes);
+  return reduction;
+}
+
+Status ValidateInputs(const Tensor *shape_t, const Tensor *reduction_axes_t) {
+  // indices and values are validated in SparseTensor ctor.
+  if (!TensorShapeUtils::IsVector(shape_t->shape())) {
+    return errors::InvalidArgument(
+        "Expected input_shape to be a vector; got shape: ",
+        shape_t->shape().DebugString());
+  }
+  if (!TensorShapeUtils::IsScalar(reduction_axes_t->shape()) &&
+      !TensorShapeUtils::IsVector(reduction_axes_t->shape())) {
+    return errors::InvalidArgument(
+        "Expected reduction_axes to be a scalar or a vector; got shape: ",
+        reduction_axes_t->shape().DebugString());
+  }
+
+  const auto reduction_axes_flat = reduction_axes_t->flat<int32>();
+  for (int64 i = 0; i < reduction_axes_flat.size(); i++) {
+    int32 axis = reduction_axes_flat(i);
+    if (axis < -shape_t->NumElements() || axis >= shape_t->NumElements()) {
+      return errors::InvalidArgument("Invalid reduction dimension ", axis,
+                                     ", for input with ",
+                                     shape_t->NumElements(), " dimensions.");
+    }
+  }
+
+  return Status::OK();
+}
+
+struct SumOp {
+  template <typename T>
+  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
+      s.device(ctx->eigen_cpu_device()) = v.sum();
+  }
+  static StringPiece Name() {
+      return "sum";
+  }
+};
+
+struct MaxOp {
+  template <typename T>
+  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
+      s.device(ctx->eigen_cpu_device()) = v.maximum();
+  }
+  static StringPiece Name() {
+      return "max";
+  }
+};
+
+template <typename T, typename Op>
+class SparseReduceOp : public OpKernel {
+ public:
+  explicit SparseReduceOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
+  }
+
+  void Compute(OpKernelContext *ctx) override {
+    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
+    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
+    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
+    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
+    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
+
+    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
+
+    // TODO(zongheng): we will call Reorder() below, which will modify
+    // in-place the underlying indices and values buffers.  To avoid
+    // surprises of this kernel being stateful, we work around the above by
+    // making deep copies here.  Remove this if/when we change Reorder()'s
+    // semantics.
+    const auto shape_vec = shape_t->vec<int64>();
+    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
+                    TensorShape(shape_vec));
+    ReduceDetails reduction = SparseTensorReduceHelper(
+        sp, reduction_axes_t->flat<int32>(), keep_dims_);
+
+    Tensor *out_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, reduction.reduced_shape, &out_values));
+    auto out_flat = out_values->flat<T>();
+    out_flat.setZero();
+
+    Tensor tmp_reduced_val;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                           TensorShape({}), &tmp_reduced_val));
+    auto reduced_val = tmp_reduced_val.scalar<T>();
+
+    // Compute strides, and use it to convert coords to flat index.  The
+    // coordinates returned by .group() have the same ndims as group_by_dims.
+    gtl::InlinedVector<int64, 8> output_strides(reduction.group_by_dims.size());
+    if (!output_strides.empty()) {  // Do this iff we don't reduce all.
+      output_strides.back() = 1;
+      for (int d = output_strides.size() - 2; d >= 0; --d) {
+        output_strides[d] =
+            output_strides[d + 1] * shape_vec(reduction.group_by_dims[d + 1]);
+      }
+    }
+
+    auto CoordinatesToFlatIndex = [](ArraySlice<int64> coords,
+                                     ArraySlice<int64> strides) {
+      if (strides.empty()) {  // Reduce all.
+        return 0LL;
+      }
+      CHECK_EQ(coords.size(), strides.size());
+      int64 idx = 0;
+      for (int i = 0; i < coords.size(); ++i) {
+        idx += coords[i] * strides[i];
+      }
+      return idx;
+    };
+
+    // Each group maps one-on-one onto a value in the reduced tensor.
+    // g.group() provides the coordinates of a particular reduced value.
+    sp.Reorder<T>(reduction.reorder_dims);
+    for (const auto &g : sp.group(reduction.group_by_dims)) {
+      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
+      const int64 idx = CoordinatesToFlatIndex(g.group(), output_strides);
+      out_flat(idx) = reduced_val();
+      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
+              << "; idx: " << idx << "; group " << Op::Name() << ": "
+              << reduced_val();
+    }
+  }
+
+ private:
+  // True if the number of dimensions should be maintained.
+  bool keep_dims_;
+};
+
+#define REGISTER_KERNELS(T)                                              \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseReduceSum").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceOp<T, SumOp>)
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(T)                                              \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseReduceMax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceOp<T, MaxOp>)
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+template <typename T, typename Op>
+class SparseReduceSparseOp : public OpKernel {
+ public:
+  explicit SparseReduceSparseOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
+  }
+
+  void Compute(OpKernelContext *ctx) override {
+    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
+    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
+    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
+    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
+    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
+
+    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
+
+    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
+                    TensorShape(shape_t->vec<int64>()));
+    ReduceDetails reduction = SparseTensorReduceHelper(
+        sp, reduction_axes_t->flat<int32>(), keep_dims_);
+
+    sp.Reorder<T>(reduction.reorder_dims);
+    // Count nnzs in the output SparseTensor.
+    int64 nnz = 0;
+    auto iter = sp.group(reduction.group_by_dims);
+    for (auto it = iter.begin(); it != iter.end(); ++it) {
+      nnz++;
+    }
+
+    Tensor *out_indices_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(
+                       0, TensorShape({nnz, reduction.reduced_shape.dims()}),
+                       &out_indices_t));
+    typename TTypes<int64>::Matrix out_indices_mat =
+        out_indices_t->matrix<int64>();
+    // For keep_dims. We don't explicitly set dim fields for reduced dims below.
+    out_indices_mat.setZero();
+
+    Tensor *out_values_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(1, TensorShape({nnz}), &out_values_t));
+    auto out_flat = out_values_t->flat<T>();
+
+    Tensor tmp_reduced_val;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                           TensorShape({}), &tmp_reduced_val));
+    auto reduced_val = tmp_reduced_val.scalar<T>();
+    int64 i = 0;
+    for (const auto &g : sp.group(reduction.group_by_dims)) {
+      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
+      std::vector<int64> group = g.group();
+      for (int64 j = 0; j < group.size(); j++) {
+        if (keep_dims_) {
+          out_indices_mat(i, reduction.group_by_dims[j]) = group[j];
+        } else {
+          out_indices_mat(i, j) = group[j];
+        }
+      }
+      out_flat(i) = reduced_val();
+      i++;
+      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
+              << "; group " << Op::Name() << ": "
+              << reduced_val();
+    }
+
+    Tensor *out_shape_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            2, TensorShape({reduction.reduced_shape.dims()}),
+                            &out_shape_t));
+    auto out_shape_flat = out_shape_t->flat<int64>();
+    auto out_dim_sizes = reduction.reduced_shape.dim_sizes();
+    std::copy(out_dim_sizes.begin(), out_dim_sizes.end(), &out_shape_flat(0));
+  }
+
+ private:
+  // True if the number of dimensions should be maintained.
+  bool keep_dims_;
+};
+
+#define REGISTER_KERNELS(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("SparseReduceSumSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceSparseOp<T, SumOp>)
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("SparseReduceMaxSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceSparseOp<T, MaxOp>)
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op.cc b/tensorflow/core/kernels/sparse_reduce_sum_op.cc
deleted file mode 100644
index 074aab9f9e..0000000000
--- a/tensorflow/core/kernels/sparse_reduce_sum_op.cc
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/sparse_ops.cc.
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_util.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/util/sparse/sparse_tensor.h"
-
-// TODO(b/31496047): Fix non-standard include order.
-#include <numeric>  // clang-format off
-
-using tensorflow::sparse::SparseTensor;
-using tensorflow::gtl::ArraySlice;
-
-namespace tensorflow {
-
-struct ReduceDetails {
-  // The dimensions to call Reorder() with.
-  std::vector<int64> reorder_dims;
-
-  // The dimensions to call group() with after Reorder().
-  std::vector<int64> group_by_dims;
-
-  // The shape after reduction.
-  TensorShape reduced_shape;
-};
-
-// Compute common reduce parameters that'll be used for SparseTensor
-// reductions. Usage:
-// ReduceDetails reduction = SparseTensorReduceHelper(sp, axes, keep_dims);
-// sp.Reorder(reduction.reorder_dims);
-// for (const auto& g : sp.group(reduction.group_by_dims)) {
-//   ...
-// }
-// // Set output shape to reduction.reduced_shape.
-ReduceDetails SparseTensorReduceHelper(const SparseTensor &sp,
-                                       gtl::ArraySlice<int32> axes_slice,
-                                       bool keep_dims) {
-  ReduceDetails reduction;
-
-  std::vector<int32> reduction_axes(axes_slice.begin(), axes_slice.end());
-  int ndims = sp.dims();
-  for (int64 i = 0; i < reduction_axes.size(); ++i) {
-    reduction_axes[i] = (reduction_axes[i] + ndims) % ndims;
-  }
-  std::sort(reduction_axes.begin(), reduction_axes.end());
-
-  // (0) Calculate the grouping dimensions:
-  // group_by_dims == {0, .., NDIMS-1} \ reduction_axes.
-  std::vector<int64> perm(ndims);
-  std::iota(perm.begin(), perm.end(), 0);
-
-  // Requires perm and reduction_axes_ be sorted; group_by_dims will be
-  // sorted as well.
-  std::set_difference(
-      perm.begin(), perm.end(), reduction_axes.begin(), reduction_axes.end(),
-      std::inserter(reduction.group_by_dims, reduction.group_by_dims.begin()));
-
-  // Now append the rest of the axes (the complement of group_by_dims_);
-  // result is used by Reorder().
-  reduction.reorder_dims = reduction.group_by_dims;
-  std::set_difference(perm.begin(), perm.end(), reduction.group_by_dims.begin(),
-                      reduction.group_by_dims.end(),
-                      std::back_inserter(reduction.reorder_dims));
-
-  // (1) Calculate the shape after reduction.
-  auto sp_shape = sp.shape();
-  std::vector<int64> out_dim_sizes;
-  if (keep_dims) {
-    out_dim_sizes.reserve(ndims);
-    auto beg = reduction.group_by_dims.begin();
-    auto end = reduction.group_by_dims.end();
-    for (int d = 0; d < ndims; ++d) {
-      if (std::find(beg, end, d) == end) {
-        out_dim_sizes.push_back(1);  // A reduced axis.
-      } else {
-        out_dim_sizes.push_back(sp_shape[d]);
-      }
-    }
-  } else {
-    out_dim_sizes = sp.PickDims(reduction.group_by_dims);
-  }
-
-  reduction.reduced_shape = TensorShape(out_dim_sizes);
-  return reduction;
-}
-
-Status ValidateInputs(const Tensor *shape_t, const Tensor *reduction_axes_t) {
-  // indices and values are validated in SparseTensor ctor.
-  if (!TensorShapeUtils::IsVector(shape_t->shape())) {
-    return errors::InvalidArgument(
-        "Expected input_shape to be a vector; got shape: ",
-        shape_t->shape().DebugString());
-  }
-  if (!TensorShapeUtils::IsScalar(reduction_axes_t->shape()) &&
-      !TensorShapeUtils::IsVector(reduction_axes_t->shape())) {
-    return errors::InvalidArgument(
-        "Expected reduction_axes to be a scalar or a vector; got shape: ",
-        reduction_axes_t->shape().DebugString());
-  }
-
-  const auto reduction_axes_flat = reduction_axes_t->flat<int32>();
-  for (int64 i = 0; i < reduction_axes_flat.size(); i++) {
-    int32 axis = reduction_axes_flat(i);
-    if (axis < -shape_t->NumElements() || axis >= shape_t->NumElements()) {
-      return errors::InvalidArgument("Invalid reduction dimension ", axis,
-                                     ", for input with ",
-                                     shape_t->NumElements(), " dimensions.");
-    }
-  }
-
-  return Status::OK();
-}
-
-template <typename T>
-class SparseReduceSumOp : public OpKernel {
- public:
-  explicit SparseReduceSumOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
-  }
-
-  void Compute(OpKernelContext *ctx) override {
-    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
-    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
-    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
-
-    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
-
-    // TODO(zongheng): we will call Reorder() below, which will modify
-    // in-place the underlying indices and values buffers.  To avoid
-    // surprises of this kernel being stateful, we work around the above by
-    // making deep copies here.  Remove this if/when we change Reorder()'s
-    // semantics.
-    const auto shape_vec = shape_t->vec<int64>();
-    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
-                    TensorShape(shape_vec));
-    ReduceDetails reduction = SparseTensorReduceHelper(
-        sp, reduction_axes_t->flat<int32>(), keep_dims_);
-
-    Tensor *out_values;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output(0, reduction.reduced_shape, &out_values));
-    auto out_flat = out_values->flat<T>();
-    out_flat.setZero();
-
-    Tensor tmp_group_sum;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_group_sum));
-    auto group_sum = tmp_group_sum.scalar<T>();
-
-    // Compute strides, and use it to convert coords to flat index.  The
-    // coordinates returned by .group() have the same ndims as group_by_dims.
-    gtl::InlinedVector<int64, 8> output_strides(reduction.group_by_dims.size());
-    if (!output_strides.empty()) {  // Do this iff we don't reduce all.
-      output_strides.back() = 1;
-      for (int d = output_strides.size() - 2; d >= 0; --d) {
-        output_strides[d] =
-            output_strides[d + 1] * shape_vec(reduction.group_by_dims[d + 1]);
-      }
-    }
-
-    auto CoordinatesToFlatIndex = [](ArraySlice<int64> coords,
-                                     ArraySlice<int64> strides) {
-      if (strides.empty()) {  // Reduce all.
-        return 0LL;
-      }
-      CHECK_EQ(coords.size(), strides.size());
-      int64 idx = 0;
-      for (int i = 0; i < coords.size(); ++i) {
-        idx += coords[i] * strides[i];
-      }
-      return idx;
-    };
-
-    // Each group maps one-on-one onto a value in the reduced tensor.
-    // g.group() provides the coordinates of a particular reduced value.
-    sp.Reorder<T>(reduction.reorder_dims);
-    for (const auto &g : sp.group(reduction.group_by_dims)) {
-      group_sum.device(ctx->eigen_cpu_device()) = g.template values<T>().sum();
-      const int64 idx = CoordinatesToFlatIndex(g.group(), output_strides);
-      out_flat(idx) = group_sum();
-      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; idx: " << idx << "; group sum: " << group_sum();
-    }
-  }
-
- private:
-  // True if the number of dimensions should be maintained.
-  bool keep_dims_;
-};
-
-#define REGISTER_KERNELS(T)                                              \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("SparseReduceSum").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSumOp<T>)
-TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-template <typename T>
-class SparseReduceSumSparseOp : public OpKernel {
- public:
-  explicit SparseReduceSumSparseOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
-  }
-
-  void Compute(OpKernelContext *ctx) override {
-    const Tensor *indices_t, *values_t, *shape_t, *reduction_axes_t;
-    OP_REQUIRES_OK(ctx, ctx->input("input_indices", &indices_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_values", &values_t));
-    OP_REQUIRES_OK(ctx, ctx->input("input_shape", &shape_t));
-    OP_REQUIRES_OK(ctx, ctx->input("reduction_axes", &reduction_axes_t));
-
-    OP_REQUIRES_OK(ctx, ValidateInputs(shape_t, reduction_axes_t));
-
-    SparseTensor sp(tensor::DeepCopy(*indices_t), tensor::DeepCopy(*values_t),
-                    TensorShape(shape_t->vec<int64>()));
-    ReduceDetails reduction = SparseTensorReduceHelper(
-        sp, reduction_axes_t->flat<int32>(), keep_dims_);
-
-    sp.Reorder<T>(reduction.reorder_dims);
-    // Count nnzs in the output SparseTensor.
-    int64 nnz = 0;
-    auto iter = sp.group(reduction.group_by_dims);
-    for (auto it = iter.begin(); it != iter.end(); ++it) {
-      nnz++;
-    }
-
-    Tensor *out_indices_t;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output(
-                       0, TensorShape({nnz, reduction.reduced_shape.dims()}),
-                       &out_indices_t));
-    typename TTypes<int64>::Matrix out_indices_mat =
-        out_indices_t->matrix<int64>();
-    // For keep_dims. We don't explicitly set dim fields for reduced dims below.
-    out_indices_mat.setZero();
-
-    Tensor *out_values_t;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output(1, TensorShape({nnz}), &out_values_t));
-    auto out_flat = out_values_t->flat<T>();
-
-    Tensor tmp_group_sum;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_group_sum));
-    auto group_sum = tmp_group_sum.scalar<T>();
-    int64 i = 0;
-    for (const auto &g : sp.group(reduction.group_by_dims)) {
-      group_sum.device(ctx->eigen_cpu_device()) = g.template values<T>().sum();
-      std::vector<int64> group = g.group();
-      for (int64 j = 0; j < group.size(); j++) {
-        if (keep_dims_) {
-          out_indices_mat(i, reduction.group_by_dims[j]) = group[j];
-        } else {
-          out_indices_mat(i, j) = group[j];
-        }
-      }
-      out_flat(i) = group_sum();
-      i++;
-      VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; group sum: " << group_sum();
-    }
-
-    Tensor *out_shape_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-                            2, TensorShape({reduction.reduced_shape.dims()}),
-                            &out_shape_t));
-    auto out_shape_flat = out_shape_t->flat<int64>();
-    auto out_dim_sizes = reduction.reduced_shape.dim_sizes();
-    std::copy(out_dim_sizes.begin(), out_dim_sizes.end(), &out_shape_flat(0));
-  }
-
- private:
-  // True if the number of dimensions should be maintained.
-  bool keep_dims_;
-};
-
-#define REGISTER_KERNELS(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("SparseReduceSumSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSumSparseOp<T>)
-TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index b4698a8053..2db3e5ef77 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -40,6 +40,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -182,6 +185,10 @@ class StackOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp);
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"),
                         StackOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_SYCL).HostMemory("handle"),
+                        StackOp);
+#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -213,7 +220,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        std::is_same<Device, GPUDevice>::value &&
+        ( std::is_same<Device, GPUDevice>::value
+#ifdef TENSORFLOW_USE_SYCL
+          || std::is_same<Device, SYCLDevice>::value
+#endif // TENSORFLOW_USE_SYCL
+        ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -289,6 +300,31 @@ REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("handle")       \
+                              .TypeConstraint<type>("T"), \
+                          StackPushOp<SYCLDevice>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("handle")       \
+                              .HostMemory("elem")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          StackPushOp<SYCLDevice>)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(bool);
+#undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_HOST_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
 class StackPopOp : public AsyncOpKernel {
  public:
   explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
@@ -359,6 +395,31 @@ REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                \
+  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
+                              .Device(DEVICE_SYCL)                \
+                              .HostMemory("handle")               \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+
+#define REGISTER_SYCL_HOST_KERNEL(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
+                              .Device(DEVICE_SYCL)                \
+                              .HostMemory("handle")               \
+                              .HostMemory("elem")                 \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(bool);
+
+#undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_HOST_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
 class StackCloseOp : public OpKernel {
  public:
   explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -376,5 +437,8 @@ class StackCloseOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp);
 REGISTER_KERNEL_BUILDER(
     Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
-
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StackClose").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_functor.h b/tensorflow/core/kernels/tile_functor.h
new file mode 100644
index 0000000000..d80898b24f
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor.h
@@ -0,0 +1,115 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+// Helper to compute 'strides' given a tensor 'shape'. I.e.,
+// strides[i] = prod(shape.dim_size[(i+1):])
+template <typename Index>
+gtl::InlinedVector<Index, 8> ComputeStride(const TensorShape& shape) {
+  const int ndims = shape.dims();
+  gtl::InlinedVector<Index, 8> strides(ndims);
+  Index stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<Index>(shape.dim_size(i));
+  }
+  return strides;
+}
+
+
+// Device-specific naive implementation for tile.
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in);
+
+template <typename Device, typename T, int NDIM>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>& broadcast_array) {
+  auto x = in.tensor<T, NDIM>();
+  auto y = out->tensor<T, NDIM>();
+
+  Eigen::array<int32, NDIM> b;
+  for (int i = 0; i < NDIM; ++i) b[i] = broadcast_array[i];
+  if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+    // Use 32bit indexing to speed up the computations
+    To32Bit(y).device(d) = To32Bit(x).broadcast(b);
+  } else {
+    y.device(d) = x.broadcast(b);
+  }
+}
+
+template <typename Device, typename T>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>&) {
+  auto x = in.tensor<T, 0>();
+  auto y = out->tensor<T, 0>();
+  // In the scalar case we simply copy the input.
+  y.device(d) = x;
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+template <typename Device, typename T>
+struct Tile {
+  void operator()(const Device& d, Tensor* out, const Tensor& in,
+                  const gtl::ArraySlice<int32> broadcast_array) const {
+    switch (in.dims()) {
+      case 0:
+        internal::TileUsingEigen<Device, T>(d, out, in, broadcast_array);
+        break;
+      case 1:
+        internal::TileUsingEigen<Device, T, 1>(d, out, in, broadcast_array);
+        break;
+      case 2:
+        internal::TileUsingEigen<Device, T, 2>(d, out, in, broadcast_array);
+        break;
+      case 3:
+        internal::TileUsingEigen<Device, T, 3>(d, out, in, broadcast_array);
+        break;
+      case 4:
+        internal::TileUsingEigen<Device, T, 4>(d, out, in, broadcast_array);
+        break;
+      case 5:
+        internal::TileUsingEigen<Device, T, 5>(d, out, in, broadcast_array);
+        break;
+      case 6:
+        internal::TileUsingEigen<Device, T, 6>(d, out, in, broadcast_array);
+        break;
+      case 7:
+        internal::TileUsingEigen<Device, T, 7>(d, out, in, broadcast_array);
+        break;
+      default:
+        internal::TileSimple<Device, T>(d, out, in);
+        break;
+    }
+  }
+};
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
new file mode 100644
index 0000000000..3af00350b9
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -0,0 +1,85 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/tile_functor.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  const int ndims = in.dims();
+  const int64 nelem = out->NumElements();
+  gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
+  gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
+  const T* p = in.flat<T>().data();
+  T* q = out->flat<T>().data();
+
+  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
+    int64 i_idx = 0;
+    int64 t = o_idx;
+    for (int i = 0; i < ndims; ++i) {
+      i_idx += t / out_strides[i] % in.dim_size(i) * in_strides[i];
+      t %= out_strides[i];
+    }
+    q[o_idx] = p[i_idx];
+  }
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Register functors used for Tile functor.
+#define DEFINE_TYPE(T) template struct Tile<CPUDevice, T>;
+
+TF_CALL_bool(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_uint8(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+TF_CALL_half(DEFINE_TYPE);
+TF_CALL_complex64(DEFINE_TYPE);
+TF_CALL_complex128(DEFINE_TYPE);
+TF_CALL_string(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+
+#define DEFINE_TYPE(T) template struct Tile<SYCLDevice, T>;
+
+TF_CALL_bool(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_uint8(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+#endif // TENSORFLOW_USE_SYCL
+
+}  // end namespace functor
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
new file mode 100644
index 0000000000..08aca8c731
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+namespace internal {
+
+template <typename T>
+__global__ void TileKernel(int nthreads, const T* src, const int32* buf,
+                           const int32 ndims, T* dst) {
+  const int32* in_strides = buf;
+  const int32* out_strides = buf + ndims;
+  const int32* in_dim_sizes = buf + ndims * 2;
+  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
+    int32 i_idx = 0;
+    int32 t = o_idx;
+    for (int i = 0; i < ndims; ++i) {
+      i_idx += t / out_strides[i] % in_dim_sizes[i] * in_strides[i];
+      t %= out_strides[i];
+    }
+    dst[o_idx] = ldg(src + i_idx);
+  }
+}
+
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  // Ensures we can use 32-bit index.
+  const int64 in_nelem = in.NumElements();
+  CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU";
+  const int64 out_nelem = out->NumElements();
+  CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU";
+  // Pack strides and input dimension sizes into one buffer.
+  const int32 ndims = in.dims();
+  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
+  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
+  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
+  for (int i = 0; i < ndims; ++i) {
+    host_buf[i] = in_strides[i];
+    host_buf[ndims + i] = out_strides[i];
+    host_buf[ndims * 2 + i] = in.dim_size(i);
+  }
+  // Copies the input strides, output strides and input dimension sizes to the device.
+  auto num_bytes = sizeof(int64) * host_buf.size();
+  auto dev_buf = d.allocate(num_bytes);
+  // NOTE: host_buf is not allocated by CudaHostAllocator, and
+  // therefore we are doing a sync copy effectively.
+  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
+  // Launch kernel to q[...] = p[...].
+  const T* p = in.flat<T>().data();
+  T* q = out->flat<T>().data();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
+  TileKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
+      ndims, q);
+  // Safe to deallocate immediately after the kernel launch.
+  d.deallocate(dev_buf);
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Register functors used for Tile functor.
+#define DEFINE_TYPE(T) template struct Tile<GPUDevice, T>;
+
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_half(DEFINE_TYPE);
+TF_CALL_complex64(DEFINE_TYPE);
+TF_CALL_complex128(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+
+}  // end namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 7c72487d3f..f1da3c8afb 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -44,21 +44,12 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif // TENSORFLOW_USE_SYCL
 
-// Forward declarations of functors that will be defined in
-// tile_ops_cpu_impl*.cc and tile_ops_gpu.cu.cc.
+// Forward declarations of functors that will be defined in tile_ops_impl.h
 namespace functor {
-template <typename Device, typename T, int NDIM>
-struct Tile {
-  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
-                  typename TTypes<T, NDIM>::ConstTensor in,
-                  const Eigen::array<int32, NDIM>& broadcast_array) const;
-};
-
 template <typename Device, typename T>
-struct Tile<Device, T, 0> {
-  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
-                  typename TTypes<T, 0>::ConstTensor in,
-                  const Eigen::array<int32, 0>&) const;
+struct Tile {
+  void operator()(const Device& d, Tensor* out, const Tensor& in,
+                  const gtl::ArraySlice<int32> broadcast_array) const;
 };
 
 template <typename Device, typename T, int NDIM>
@@ -134,21 +125,12 @@ class TileOp : public OpKernel {
     // If there's no output, there's nothing to do.
     if (output_shape.num_elements() == 0) return;
 
-#define HANDLE_DIM(DT, NDIM)                                   \
-  if (context->input(0).dtype() == DT && input_dims == NDIM) { \
-    HandleCase<DT, NDIM>(context, multiples_array, result);    \
+#define HANDLE_TYPE(DT)                                        \
+  if (context->input(0).dtype() == DT) {                       \
+    HandleCase<DT>(context, multiples_array, result);          \
     return;                                                    \
   }
 
-#define HANDLE_TYPE(T) \
-  HANDLE_DIM(T, 1)     \
-  HANDLE_DIM(T, 2)     \
-  HANDLE_DIM(T, 3)     \
-  HANDLE_DIM(T, 4)     \
-  HANDLE_DIM(T, 5)     \
-  HANDLE_DIM(T, 6)     \
-  HANDLE_DIM(T, 7)
-
 #define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
 
     // Invoke macro using TF_CALL_* so type-filtering for platform applies.
@@ -166,7 +148,6 @@ class TileOp : public OpKernel {
 
 #undef HANDLE_TYPE_NAME
 #undef HANDLE_TYPE
-#undef HANDLE_DIM
 
     OP_REQUIRES(context, false,
                 errors::Unimplemented(
@@ -175,21 +156,17 @@ class TileOp : public OpKernel {
   }
 
  private:
-  template <DataType DT, int NDIM>
+  template <DataType DT>
   void HandleCaseImpl(OpKernelContext* context,
                       const gtl::ArraySlice<int32>& multiples_array,
                       Tensor* result) {
     typedef typename EnumToDataType<DT>::Type T;
-    Eigen::array<int32, NDIM> broadcast_array;
-    for (int i = 0; i < NDIM; ++i) {
-      broadcast_array[i] = multiples_array[i];
-    }
-    functor::Tile<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
-        context->input(0).tensor<T, NDIM>(), broadcast_array);
+    functor::Tile<Device, T>() (
+        context->eigen_device<Device>(), result,
+        context->input(0), multiples_array);
   }
 
-  template <DataType DT, int NDIM>
+  template <DataType DT>
   void HandleCase(OpKernelContext* context,
                   const gtl::ArraySlice<int32>& multiples_array,
                   Tensor* result);
@@ -198,45 +175,35 @@ class TileOp : public OpKernel {
 };
 
 template <typename Device>
-template <DataType DT, int NDIM>
+template <DataType DT>
 inline void TileOp<Device>::HandleCase(
     OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array,
     Tensor* result) {
   // TODO(vrv): print out the device name if useful. Currently disabled to avoid
   // having to use RTTI.
-  LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: "
+  LOG(FATAL) << "TileOp: Invalid combination of Device, DT: "
              // << typeid(Device).name() << ", "
-             << DataTypeString(DT) << ", " << NDIM;
+             << DataTypeString(DT);
 }
 
-#define HANDLE_CASE(device, T, dtype, ndim)                            \
+#define HANDLE_CASE(device, dtype)                                     \
   template <>                                                          \
   template <>                                                          \
-  void TileOp<device>::HandleCase<dtype, ndim>(                        \
+  void TileOp<device>::HandleCase<dtype>(                              \
       OpKernelContext * context,                                       \
       const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
-    HandleCaseImpl<dtype, ndim>(context, multiples_array, result);     \
+    HandleCaseImpl<dtype>(context, multiples_array, result);           \
   }
 
-// 0-D handled above
-#define HANDLE_CASE_DIM(device, T, dtype) \
-  HANDLE_CASE(device, T, dtype, 1);       \
-  HANDLE_CASE(device, T, dtype, 2);       \
-  HANDLE_CASE(device, T, dtype, 3);       \
-  HANDLE_CASE(device, T, dtype, 4);       \
-  HANDLE_CASE(device, T, dtype, 5);       \
-  HANDLE_CASE(device, T, dtype, 6);       \
-  HANDLE_CASE(device, T, dtype, 7);
-
 #define HANDLE_TYPE_NAME_CPU(T) \
-  HANDLE_CASE_DIM(CPUDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value);
 
 #define HANDLE_TYPE_NAME_GPU(T) \
-  HANDLE_CASE_DIM(GPUDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define HANDLE_TYPE_NAME_SYCL(T) \
-  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value);
 #endif // TENSORFLOW_USE_SYCL
 
 TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
@@ -275,7 +242,6 @@ TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
 #ifdef TENSORFLOW_USE_SYCL
 #undef HANDLE_TYPE_NAME_SYCL
 #endif // TENSORFLOW_USE_SYCL
-#undef HANDLE_CASE_DIM
 #undef HANDLE_CASE
 
 // --------------------------------------------------------------------------
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index db3f046439..a6eed4935d 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -21,29 +21,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/tile_ops_impl.h"
 
 namespace tensorflow {
+
 namespace functor {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// Register functors used for TileOp.
-#define DEFINE_DIM(T, NDIM) template struct Tile<CPUDevice, T, NDIM>;
-#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_uint8(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-TF_CALL_half(DEFINE_TYPE);
-TF_CALL_complex64(DEFINE_TYPE);
-TF_CALL_complex128(DEFINE_TYPE);
-TF_CALL_string(DEFINE_TYPE);
-
-#undef DEFINE_DIM
-#undef DEFINE_TYPE
-
 // Register functors used for TileGradientOp.
 #define DEFINE_DIM(T, NDIM)                     \
   template struct TileGrad<CPUDevice, T, NDIM>; \
@@ -65,21 +47,6 @@ TF_CALL_complex128(DEFINE_TYPE);
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 
-// Register functors used for TileOp.
-#define DEFINE_DIM(T, NDIM) template struct Tile<SYCLDevice, T, NDIM>;
-#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_uint8(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-
-#undef DEFINE_DIM
-#undef DEFINE_TYPE
-
 // Register functors used for TileGradientOp.
 #define DEFINE_DIM(T, NDIM)                      \
   template struct TileGrad<SYCLDevice, T, NDIM>; \
diff --git a/tensorflow/core/kernels/tile_ops_gpu_impl.h b/tensorflow/core/kernels/tile_ops_gpu_impl.h
index ed89381103..592f99e9b7 100644
--- a/tensorflow/core/kernels/tile_ops_gpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/tile_ops_impl.h"
 
 #define DEFINE_DIM(T, NDIM)                            \
-  template struct Tile<Eigen::GpuDevice, T, NDIM>;     \
   template struct TileGrad<Eigen::GpuDevice, T, NDIM>; \
   template struct ReduceAndReshape<Eigen::GpuDevice, T, NDIM, 1>;
 
diff --git a/tensorflow/core/kernels/tile_ops_impl.h b/tensorflow/core/kernels/tile_ops_impl.h
index c41e4bd74b..9861717a0b 100644
--- a/tensorflow/core/kernels/tile_ops_impl.h
+++ b/tensorflow/core/kernels/tile_ops_impl.h
@@ -21,31 +21,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
-namespace functor {
 
-template <typename Device, typename T, int NDIM>
-struct Tile {
-  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
-                  typename TTypes<T, NDIM>::ConstTensor in,
-                  const Eigen::array<int32, NDIM>& broadcast_array) const {
-    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-      // Use 32bit indexing to speed up the computations
-      To32Bit(out).device(d) = To32Bit(in).broadcast(broadcast_array);
-    } else {
-      out.device(d) = in.broadcast(broadcast_array);
-    }
-  }
-};
-
-template <typename Device, typename T>
-struct Tile<Device, T, 0> {
-  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
-                  typename TTypes<T, 0>::ConstTensor in,
-                  const Eigen::array<int32, 0>&) const {
-    // In the scalar case we simply copy the input.
-    out.device(d) = in;
-  }
-};
+namespace functor {
 
 template <typename Device, typename T, int NDIM>
 struct TileGrad {
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 630fcb76f3..5c89eaef5f 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -93,7 +93,7 @@ class TopK : public OpKernel {
       rows_by_one.set(0, num_rows);
 #else
       Eigen::array<int, 1> reduce_on_cols = {1};
-      Eigen::array<int, 1> rows_by_one = {static_cast<int>(num_rows), 1};
+      Eigen::array<int, 2> rows_by_one = {static_cast<int>(num_rows), 1};
 #endif
 
       values.device(d) =
diff --git a/tensorflow/core/kernels/transpose_functor.h b/tensorflow/core/kernels/transpose_functor.h
index 124cf14dd2..f1ab770eeb 100644
--- a/tensorflow/core/kernels/transpose_functor.h
+++ b/tensorflow/core/kernels/transpose_functor.h
@@ -132,6 +132,13 @@ template <typename Device, typename T, int NDIMS>
 void TransposeUsingEigen(const Device& d, const Tensor& in,
                          const gtl::ArraySlice<int32> perm, Tensor* out);
 
+
+#ifdef TENSORFLOW_USE_SYCL
+// For SYCL lets always go through Eigen
+template <typename Device, typename T>
+void TransposeSYCL(const Device& d, const Tensor& in,
+                   const gtl::ArraySlice<int32> perm, Tensor* out);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace internal
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 75ed76a697..d3305fb83a 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -233,10 +233,7 @@ Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                               .TypeConstraint<int32>("Tperm") \
                               .HostMemory("perm"),            \
                           TransposeSyclOp);
-REGISTER(float);
-REGISTER(bool);
-REGISTER(int32);
+TF_CALL_POD_TYPES(REGISTER);
 #undef REGISTER
 #endif
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/typed_conditional_accumulator_base.h b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
index dbd7de7ce0..1980f758fc 100644
--- a/tensorflow/core/kernels/typed_conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 
 /*
  * TypedConditionalAccumulatorBase is a templated companion of
- * ConditionalAccumulatorBase which allows for subclassses to use different
+ * ConditionalAccumulatorBase which allows for subclasses to use different
  * types for the input gradients. (See ConditionalAccumulator and
  * SparseConditionalAccumulator.)
  *
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 8ba4b09143..2ff8b9c7d1 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -656,7 +656,7 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constaint checks.  You can't have an optional of nullopt_t, in_place_t or
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
   // a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index a530d286f7..9a58a31757 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -155,6 +155,26 @@ Status Log1pGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Log1p", Log1pGrad);
 
+Status SinhGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForUnaryCwise(g, {
+      {{"cosh"}, "Cosh", {"x"}, {}, {"dy"}},
+      {{"dx"}, "Mul", {"dy", "cosh"}},  // dy * cosh(x)
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("Sinh", SinhGrad);
+
+Status CoshGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForUnaryCwise(g, {
+      {{"sinh"}, "Sinh", {"x"}, {}, {"dy"}},
+      {{"dx"}, "Mul", {"dy", "sinh"}},  // dy * sinh(x)
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("Cosh", CoshGrad);
+
 Status TanhGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForUnaryCwise(g, {
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 38813b3f2b..aa9706a328 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -495,6 +495,26 @@ TEST_F(MathGradTest, Log1p) {
   test::ExpectClose(ans, dx);
 }
 
+TEST_F(MathGradTest, Sinh) {
+  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
+                                 TensorShape({2, 3}));
+  auto g = [](float x) { return std::cosh(x); };
+  auto dx = test::AsTensor<float>(
+      {g(-3.f), g(-2.f), g(-1.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
+  auto ans = SymGrad("Sinh", x);
+  test::ExpectClose(ans, dx);
+}
+
+TEST_F(MathGradTest, Cosh) {
+  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
+                                 TensorShape({2, 3}));
+  auto g = [](float x) { return std::sinh(x); };
+  auto dx = test::AsTensor<float>(
+      {g(-3.f), g(-2.f), g(-1.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
+  auto ans = SymGrad("Cosh", x);
+  test::ExpectClose(ans, dx);
+}
+
 TEST_F(MathGradTest, Tanh) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 3fe543a239..30d6987707 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -293,6 +293,14 @@ Computes natural logarithm of (1 + x) element-wise.
 I.e., \\(y = \log_e (1 + x)\\).
 )doc");
 
+REGISTER_OP("Sinh").UNARY_COMPLEX().Doc(R"doc(
+Computes hyperbolic sine of x element-wise.
+)doc");
+
+REGISTER_OP("Cosh").UNARY_COMPLEX().Doc(R"doc(
+Computes hyperbolic cosine of x element-wise.
+)doc");
+
 REGISTER_OP("Tanh").UNARY_COMPLEX().Doc(R"doc(
 Computes hyperbolic tangent of `x` element-wise.
 )doc");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 70302c3886..3a25fd15da 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -831,11 +831,13 @@ a different filter to each input channel (expanding from 1 channel to
 `channel_multiplier` channels for each), then concatenates the results
 together. Thus, the output has `in_channels * channel_multiplier` channels.
 
+```
 for k in 0..in_channels-1
   for q in 0..channel_multiplier-1
     output[b, i, j, k * channel_multiplier + q] =
       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
                         filter[di, dj, k, q]
+```
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
 horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 58bed01150..485b72faa4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5184,6 +5184,30 @@ op {
   }
   summary: "Computes cos of x element-wise."
 }
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes hyperbolic cosine of x element-wise."
+}
 op {
   name: "CountUpTo"
   input_arg {
@@ -22849,6 +22873,30 @@ op {
   }
   summary: "Computes sin of x element-wise."
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes hyperbolic sine of x element-wise."
+}
 op {
   name: "Size"
   input_arg {
@@ -27315,6 +27363,33 @@ op {
   summary: "A Reader that outputs the records from a TensorFlow Records file."
   is_stateful: true
 }
+op {
+  name: "LMDBReader"
+  output_arg {
+    name: "reader_handle"
+    description: "The handle to reference the Reader."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
+  }
+  summary: "A Reader that outputs the records from a LMDB database."
+  is_stateful: true
+}
 op {
   name: "TakeDataset"
   input_arg {
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 9722f0ee9a..6aca2c3b01 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -710,6 +710,75 @@ a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
 b: `ndims`-D Tensor.  With shape `a_shape`.
 )doc");
 
+REGISTER_OP("SparseReduceMax")
+    .Input("input_indices: int64")
+    .Input("input_values: T")
+    .Input("input_shape: int64")
+    .Input("reduction_axes: int32")
+    .Attr("keep_dims: bool = False")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Computes the max of elements across dimensions of a SparseTensor.
+
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+instead of a sparse one.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+
+input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+  SparseTensor, possibly not in canonical ordering.
+input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+input_shape: 1-D.  Shape of the input SparseTensor.
+reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+keep_dims: If true, retain reduced dimensions with length 1.
+output: `R-K`-D.  The reduced Tensor.
+)doc");
+
+REGISTER_OP("SparseReduceMaxSparse")
+    .Input("input_indices: int64")
+    .Input("input_values: T")
+    .Input("input_shape: int64")
+    .Input("reduction_axes: int32")
+    .Attr("keep_dims: bool = False")
+    .Output("output_indices: int64")
+    .Output("output_values: T")
+    .Output("output_shape: int64")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Computes the max of elements across dimensions of a SparseTensor.
+
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+SparseTensor.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+
+input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+  SparseTensor, possibly not in canonical ordering.
+input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+input_shape: 1-D.  Shape of the input SparseTensor.
+reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+keep_dims: If true, retain reduced dimensions with length 1.
+)doc");
+
 REGISTER_OP("SparseReduceSum")
     .Input("input_indices: int64")
     .Input("input_values: T")
diff --git a/tensorflow/core/platform/cloud/retrying_utils.cc b/tensorflow/core/platform/cloud/retrying_utils.cc
index 096c77c6e3..99691ecfb9 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils.cc
@@ -89,7 +89,7 @@ Status RetryingUtils::DeleteWithRetries(
   bool is_retried = false;
   return RetryingUtils::CallWithRetries(
       [delete_func, &is_retried]() {
-        const auto& status = delete_func();
+        const Status status = delete_func();
         if (is_retried && status.code() == error::NOT_FOUND) {
           return Status::OK();
         }
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index cf05aece39..e476a84a13 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -171,7 +171,7 @@ message ExecutorOpts {
 };
 
 message RunGraphRequest {
-  // session_handle is the the master-generated unique id for this session.
+  // session_handle is the master-generated unique id for this session.
   // If session_handle is non-empty, it must be the same as used when
   // registering the graph. If it is empty, a single global namespace is used to
   // search for the graph_handle.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d30d7819fc..0e5611e359 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc2"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 6a37256ea9..67468bdc3f 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
 #include "third_party/mkl/include/mkl_service.h"
-
+#include "third_party/mkl/include/mkl_trans.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -616,6 +616,42 @@ inline void ForwarMklTensorInToOut(OpKernelContext* context,
   }
 }
 
+  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
+  // out. 
+inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
+  const float* buf_in = input.flat<float>().data();
+  float* buf_out = (*output)->flat<float>().data();
+
+  int64 N = input.dim_size(0);
+  int64 H = input.dim_size(1);
+  int64 W = input.dim_size(2);
+  int64 C = input.dim_size(3);
+  int64 stride_n = H*W*C;
+# pragma omp parallel for num_threads(16)
+  for (int64 n = 0; n < N; ++n) {
+    mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C,
+        buf_out + n*stride_n, H*W);
+  }
+}
+
+  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
+  // out. 
+inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
+  const float* buf_in = input.flat<float>().data();
+  float* buf_out = (*output)->flat<float>().data();
+
+  int64 N = (*output)->dim_size(0);
+  int64 H = (*output)->dim_size(1);
+  int64 W = (*output)->dim_size(2);
+  int64 C = (*output)->dim_size(3);
+  int64 stride_n = H*W*C;
+# pragma omp parallel for num_threads(16)
+  for (int64 n = 0; n < N; ++n) {
+    mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W,
+        buf_out + n*stride_n, C);
+  }
+}
+
 namespace mkl_op_registry {
 static const char* kMklOpLabel = "MklOp";
 static const char* kMklOpLabelPattern = "label='MklOp'";
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index 8c289dd556..30123e367f 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -13,8 +13,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions.
 It is common, when training with multiple loss functions, to adjust the relative
 strengths of individual losses. This is performed by rescaling the losses via
 a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and mean_square_error, and we wished that the
-log_loss penalty be twice as severe as the mean_square_error, we would
+training with both log_loss and mean_squared_error, and we wished that the
+log_loss penalty be twice as severe as the mean_squared_error, we would
 implement this as:
 
 ```python
@@ -22,7 +22,7 @@ implement this as:
   tf.contrib.losses.log(predictions, labels, weight=2.0)
 
   # Uses default weight of 1.0
-  tf.contrib.losses.mean_square_error(predictions, labels)
+  tf.contrib.losses.mean_squared_error(predictions, labels)
 
   # All the losses are collected into the `GraphKeys.LOSSES` collection.
   losses = tf.get_collection(tf.GraphKeys.LOSSES)
@@ -74,7 +74,7 @@ these predictions.
   predictions = MyModelPredictions(images)
 
   weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.mean_square_error(predictions, depths, weight)
+  loss  = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 Note that when using weights for the losses, the final average is computed
@@ -100,7 +100,7 @@ weighted average over the individual prediction errors:
 
   weight = MyComplicatedWeightingFunction(labels)
   weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.mean_square_error(predictions, depths, weight)
+  loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 @{tf.contrib.losses.absolute_difference}
diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md
index b3c12a61dd..3d9f203297 100644
--- a/tensorflow/docs_src/api_guides/python/math_ops.md
+++ b/tensorflow/docs_src/api_guides/python/math_ops.md
@@ -59,6 +59,8 @@ mathematical functions to your graph.
 *   @{tf.acos}
 *   @{tf.asin}
 *   @{tf.atan}
+*   @{tf.cosh}
+*   @{tf.sinh}
 *   @{tf.lgamma}
 *   @{tf.digamma}
 *   @{tf.erf}
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index fbde463415..500beb49a2 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -68,8 +68,7 @@ for abalone:
 | Feature        | Description                                               |
 | -------------- | --------------------------------------------------------- |
 | Length         | Length of abalone (in longest direction; in mm)           |
-| Diameter       | Diameter of abalone (measurement perpendicular to length; |
-:                : in mm)                                                    :
+| Diameter       | Diameter of abalone (measurement perpendicular to length; in mm)|
 | Height         | Height of abalone (with its meat inside shell; in mm)     |
 | Whole Weight   | Weight of entire abalone (in grams)                       |
 | Shucked Weight | Weight of abalone meat only (in grams)                    |
@@ -559,7 +558,7 @@ For a full list of optimizers, and other details, see the
 
 Here's the final, complete `model_fn` for the abalone age predictor. The
 following code configures the neural network; defines loss and the training op;
-and returns a `ModelFnOps` object containing `mode`, `predictions_dict`, `loss`,
+and returns a `EstimatorSpec` object containing `mode`, `predictions_dict`, `loss`,
 and `train_op`:
 
 ```python
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 77b8e2dd2e..d1c9cd696c 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -135,8 +135,9 @@ adder_node = a + b  # + provides a shortcut for tf.add(a, b)
 
 The preceding three lines are a bit like a function or a lambda in which we
 define two input parameters (a and b) and then an operation on them. We can
-evaluate this graph with multiple inputs by using the feed_dict parameter to
-specify Tensors that provide concrete values to these placeholders:
+evaluate this graph with multiple inputs by using the feed_dict argument to
+the [run method](https://www.tensorflow.org/api_docs/python/tf/Session#run)
+to feed concrete values to the placeholders:
 
 ```python
 print(sess.run(adder_node, {a: 3, b:4.5}))
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 624d916474..4858b3cf92 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -362,7 +362,7 @@ minimize. Then it can apply your choice of optimization algorithm to modify the
 variables and reduce the loss.
 
 ```python
-train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
+train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 ```
 
 In this case, we ask TensorFlow to minimize `cross_entropy` using the
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index 48d9a395f2..27fae45b5b 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -82,7 +82,7 @@ After creating placeholders for the data, the graph is built from the
 `mnist.py` file according to a 3-stage pattern: `inference()`, `loss()`, and
 `training()`.
 
-1.  `inference()` - Builds the graph as far as is required for running
+1.  `inference()` - Builds the graph as far as required for running
 the network forward to make predictions.
 1.  `loss()` - Adds to the inference graph the ops required to generate
 loss.
diff --git a/tensorflow/docs_src/get_started/tensorboard_histograms.md b/tensorflow/docs_src/get_started/tensorboard_histograms.md
index b3dd13497e..918deda190 100644
--- a/tensorflow/docs_src/get_started/tensorboard_histograms.md
+++ b/tensorflow/docs_src/get_started/tensorboard_histograms.md
@@ -34,6 +34,8 @@ tf.summary.histogram("normal/moving_mean", mean_moving_normal)
 sess = tf.Session()
 writer = tf.summary.FileWriter("/tmp/histogram_example")
 
+summaries = tf.summary.merge_all()
+
 # Setup a loop and write the summaries to disk
 N = 400
 for step in range(N):
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 91189f199d..81aa6e3f76 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -35,7 +35,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for Mac OS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.2.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index c9b8dffadb..3f9096b822 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.2.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 612c4c94f2..40ed9e1826 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.2.0-rc2</version>
+  <version>1.2.0</version>
 </dependency>
 ```
 
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.2.0-rc2</version>
+                 <version>1.2.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.2.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.2.0-rc2.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.2.0.zip).
   3. Extract this .zip file.
 
 
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.2.0-rc2.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.2.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and Mac OS X:
 
-<pre><b>java -cp libtensorflow-1.2.0-rc2.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.2.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.2.0-rc2.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.2.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 8ce4acda13..99f27d7b85 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -171,8 +171,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      issue the following command to install TensorFlow in the active
      virtualenv environment:
 
-     <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
+     <pre>(tensorflow)$ <b>pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -276,8 +276,8 @@ take the following steps:
      the following command:
 
      <pre>
-     $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b>
+     $ <b>sudo pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -464,7 +464,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index f85ecefb83..8ff0fb872f 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -108,8 +108,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      Python 2.7, the command to install
      TensorFlow in the active Virtualenv is as follows:
 
-     <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
+     <pre> $ <b>pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -229,8 +229,8 @@ take the following steps:
      you are installing TensorFlow for Mac OS and Python 2.7
      issue the following command:
 
-     <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b> </pre>
+     <pre> $ <b>sudo pip install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py2-none-any.whl
 </pre>
 
 
@@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index c455492387..a082c3ce78 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -342,10 +342,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.2.0rc2 on Linux:
+for TensorFlow 1.2.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.2.0rc2-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.2.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 42820660ee..8282afaab4 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -38,9 +38,10 @@ installed on your system:
     [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
     list of supported GPU cards.
 
-If you have an earlier version of the preceding packages, please
-upgrade to the specified versions.
-
+If you have a different version of one of the preceding packages, please
+change to the specified versions.  In particular, the cuDNN version
+must match exactly: TensorFlow will not load if it cannot find `cuDNN64_5.dll`.
+To use a different version of cuDNN, you must build from source.
 
 ## Determine how to install TensorFlow
 
@@ -73,7 +74,7 @@ Use that package at your own risk.
 If the following version of Python is not installed on your machine,
 install it now:
 
-  * [Python 3.5.x from python.org](https://www.python.org/downloads/release/python-352/)
+  * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/)
 
 TensorFlow only supports version 3.5.x of Python on Windows.
 Note that Python 3.5.x comes with the pip3 package manager, which is the
@@ -114,12 +115,12 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      environment. To install the CPU-only version of TensorFlow, enter the
      following command:
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl</b> </pre>
 
      To install the GPU version of TensorFlow, enter the following command
      (on a single line):
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl</b> </pre>
 
 ## Validate your installation
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 07c5d3087f..a5508ac23e 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -52,7 +52,8 @@ bazel build -c opt --copt=-march="broadwell" --config=cuda //tensorflow/tools/pi
     (pascal): 6.2, Titan X (maxwell): 5.2, and K80: 3.7.
 *   Install the latest CUDA platform and cuDNN libraries.
 *   Make sure to use a version of gcc that supports all of the optimizations of
-    the target CPU. The recommended minimum gcc version is 4.8.3.
+    the target CPU. The recommended minimum gcc version is 4.8.3.  On OS X upgrade
+    to the latest Xcode version and use the version of clang that comes with Xcode.
 *   TensorFlow checks on startup whether it has been compiled with the
     optimizations available on the CPU. If the optimizations are not included,
     TensorFlow will emit warnings, e.g. AVX, AVX2, and FMA instructions not
@@ -122,6 +123,11 @@ format.
 The best practice is to build models that work with both `NCHW` and `NHWC` as it
 is common to train using `NCHW` on GPU, and then do inference with NHWC on CPU.
 
+There are edge cases where `NCHW` can be slower on GPU than `NHWC`. One
+[case](https://github.com/tensorflow/tensorflow/issues/7551#issuecomment-280421351)
+is using non-fused batch norm on WRN-16-4 without dropout. In that case using
+fused batch norm, which is also recommended, is the optimal solution.
+
 The very brief history of these two formats is that TensorFlow started by using
 `NHWC` because it was a little faster on CPUs. Then the TensorFlow team
 discovered that `NCHW` performs better when using the NVIDIA cuDNN library.  The
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 4667b4cad7..a37748d0c9 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -91,11 +91,14 @@ eight-bit computations:
 ```sh
 curl http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -o /tmp/inceptionv3.tgz
 tar xzf /tmp/inceptionv3.tgz -C /tmp/
-bazel build tensorflow/tools/quantization:quantize_graph
-bazel-bin/tensorflow/tools/quantization/quantize_graph \
-  --input=/tmp/classify_image_graph_def.pb \
-  --output_node_names="softmax" --output=/tmp/quantized_graph.pb \
-  --mode=eightbit
+bazel build tensorflow/tools/graph_transforms:transform_graph
+bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
+  --in_graph=/tmp/classify_image_graph_def.pb \
+  --outputs="softmax" --out_graph=/tmp/quantized_graph.pb \
+  --transforms='add_default_attributes strip_unused_nodes(type=float, shape="1,299,299,3")
+    remove_nodes(op=Identity, op=CheckNumerics) fold_constants(ignore_errors=true)
+    fold_batch_norms fold_old_batch_norms quantize_weights quantize_nodes
+    strip_unused_nodes sort_by_execution_order'
 ```
 
 This will produce a new model that runs the same operations as the original, but
diff --git a/tensorflow/docs_src/tutorials/seq2seq.md b/tensorflow/docs_src/tutorials/seq2seq.md
index 6ffe3e8b03..dd2ca8d524 100644
--- a/tensorflow/docs_src/tutorials/seq2seq.md
+++ b/tensorflow/docs_src/tutorials/seq2seq.md
@@ -8,7 +8,10 @@ some input and generate a meaningful response? For example, could we train
 a neural network to translate from English to French? It turns out that
 the answer is *yes*.
 
-This tutorial will show you how to build and train such a system end-to-end. Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. You can then start by running the translate program:
+This tutorial will show you how to build and train such a system end-to-end.
+Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and
+the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub.
+You can then start by running the translate program:
 
 ```
 cd models/tutorials/rnn/translate
@@ -25,7 +28,7 @@ This tutorial references the following files.
 
 File | What's in it?
 --- | ---
-`tensorflow/tensorflow/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
+`tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
 `models/tutorials/rnn/translate/seq2seq_model.py` | Neural translation sequence-to-sequence model.
 `models/tutorials/rnn/translate/data_utils.py` | Helper functions for preparing translation data.
 `models/tutorials/rnn/translate/translate.py` | Binary that trains and runs the translation model.
@@ -148,9 +151,9 @@ have similar interfaces, so we will not describe them in detail. We will use
 ## Neural translation model
 
 While the core of the sequence-to-sequence model is constructed by
-the functions in `tensorflow/tensorflow/python/ops/seq2seq.py`, there are still a few tricks
-that are worth mentioning that are used in our translation model in
-`models/tutorials/rnn/translate/seq2seq_model.py`.
+the functions in `tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py`,
+there are still a few tricks that are worth mentioning that are used in our
+translation model in `models/tutorials/rnn/translate/seq2seq_model.py`.
 
 ### Sampled softmax and output projection
 
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index c2621026c7..24c866eee5 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -1,6 +1,6 @@
 # TensorFlow Linear Model Tutorial
 
-In this tutorial, we will use the TF.Learn API in TensorFlow to solve a binary
+In this tutorial, we will use the tf.contrib.learn API in TensorFlow to solve a binary
 classification problem: Given census data about a person such as age, gender,
 education and occupation (the features), we will try to predict whether or not
 the person earns more than 50,000 dollars a year (the target label). We will
@@ -16,7 +16,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
@@ -69,8 +69,8 @@ COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
            "marital_status", "occupation", "relationship", "race", "gender",
            "capital_gain", "capital_loss", "hours_per_week", "native_country",
            "income_bracket"]
-df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
-df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
+df_train = pd.read_csv(train_file.name, names=COLUMNS, skipinitialspace=True)
+df_test = pd.read_csv(test_file.name, names=COLUMNS, skipinitialspace=True, skiprows=1)
 ```
 
 Since the task is a binary classification problem, we'll construct a label
@@ -136,9 +136,9 @@ Here's a list of columns available in the Census Income dataset:
 
 ## Converting Data into Tensors
 
-When building a TF.Learn model, the input data is specified by means of an Input
+When building a tf.contrib.learn model, the input data is specified by means of an Input
 Builder function. This builder function will not be called until it is later
-passed to TF.Learn methods such as `fit` and `evaluate`. The purpose of this
+passed to tf.contrib.learn methods such as `fit` and `evaluate`. The purpose of this
 function is to construct the input data, which is represented in the form of
 @{tf.Tensor}s
 or
@@ -211,7 +211,7 @@ to predict the target label.
 ### Base Categorical Feature Columns
 
 To define a feature column for a categorical feature, we can create a
-`SparseColumn` using the TF.Learn API. If you know the set of all possible
+`SparseColumn` using the tf.contrib.learn API. If you know the set of all possible
 feature values of a column and there are only a few of them, you can use
 `sparse_column_with_keys`. Each key in the list will get assigned an
 auto-incremental ID starting from 0. For example, for the `gender` column we can
@@ -361,7 +361,7 @@ in `model_dir`.
 ## Training and Evaluating Our Model
 
 After adding all the features to the model, now let's look at how to actually
-train the model. Training a model is just a one-liner using the TF.Learn API:
+train the model. Training a model is just a one-liner using the tf.contrib.learn API:
 
 ```python
 m.fit(input_fn=train_input_fn, steps=200)
@@ -467,4 +467,4 @@ value would be high.
 
 If you're interested in learning more, check out our @{$wide_and_deep$Wide & Deep Learning Tutorial} where we'll show you how to combine
 the strengths of linear models and deep neural networks by jointly training them
-using the TF.Learn API.
+using the tf.contrib.learn API.
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index 77c905fd51..0978005d6c 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -9,7 +9,7 @@ great for training deep neural networks too, and you might be thinking which one
 you should choose—Well, why not both? Would it be possible to combine the
 strengths of both in one model?
 
-In this tutorial, we'll introduce how to use the TF.Learn API to jointly train a
+In this tutorial, we'll introduce how to use the tf.contrib.learn API to jointly train a
 wide linear model and a deep feed-forward neural network. This approach combines
 the strengths of memorization and generalization. It's useful for generic
 large-scale regression and classification problems with sparse input features
@@ -23,7 +23,7 @@ The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
 with an embedding layer and several hidden layers), and a Wide & Deep model
 (joint training of both). At a high level, there are only 3 steps to configure a
-wide, deep, or Wide & Deep model using the TF.Learn API:
+wide, deep, or Wide & Deep model using the tf.contrib.learn API:
 
 1.  Select features for the wide part: Choose the sparse base columns and
     crossed columns you want to use.
@@ -42,7 +42,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index dfb21334f8..8e7c19035e 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -351,7 +351,7 @@ to evaluate embeddings is to directly use them to predict syntactic and semantic
 relationships like `king is to queen as father is to ?`. This is called
 *analogical reasoning* and the task was introduced by
 [Mikolov and colleagues
-](http://msr-waypoint.com/en-us/um/people/gzweig/Pubs/NAACL2013Regularities.pdf).
+](http://www.anthology.aclweb.org/N/N13/N13-1090.pdf).
 Download the dataset for this task from
 [download.tensorflow.org](http://download.tensorflow.org/data/questions-words.txt).
 
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 001d57f88a..e97faad516 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -10,7 +10,7 @@
 // setting nativeBuildSystem below to 'makefile'. This will allow building the demo
 // on Windows machines, but note that full equivalence with the Bazel
 // build is not yet guaranteed. See comments below for caveats and tips
-// for speeding up the build, such as as enabling ccache.
+// for speeding up the build, such as enabling ccache.
 // NOTE: Running a make build will cause subsequent Bazel builds to *fail*
 // unless the contrib/makefile/downloads/ and gen/ dirs are deleted afterwards.
 
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 8e3b1a3a36..44a3097d80 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -800,11 +800,27 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
   return evaluation_step, prediction
 
 
-def main(_):
+def save_graph_to_file(sess, graph, graph_file_name):
+  output_graph_def = graph_util.convert_variables_to_constants(
+    sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
+  with gfile.FastGFile(graph_file_name, 'wb') as f:
+    f.write(output_graph_def.SerializeToString())
+  return
+
+
+def prepare_file_system():
   # Setup the directory we'll write summaries to for TensorBoard
   if tf.gfile.Exists(FLAGS.summaries_dir):
     tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
   tf.gfile.MakeDirs(FLAGS.summaries_dir)
+  if FLAGS.intermediate_store_frequency > 0:
+    ensure_dir_exists(FLAGS.intermediate_output_graphs_dir)
+  return
+
+
+def main(_):
+  # Prepare necessary directories  that can be used during training
+  prepare_file_system()
 
   # Set up the pre-trained graph.
   maybe_download_and_extract()
@@ -917,6 +933,14 @@ def main(_):
               (datetime.now(), i, validation_accuracy * 100,
                len(validation_bottlenecks)))
 
+      # Store intermediate results
+      intermediate_frequency = FLAGS.intermediate_store_frequency
+
+      if intermediate_frequency > 0 and (i % intermediate_frequency == 0) and i > 0:
+        intermediate_file_name = FLAGS.intermediate_output_graphs_dir + 'intermediate_' + str(i) + '.pb'
+        print('Save intermediate result to : ' + intermediate_file_name)
+        save_graph_to_file(sess, graph, intermediate_file_name)
+                
     # We've completed all our training, so run a final test evaluation on
     # some new images we haven't used before.
     test_bottlenecks, test_ground_truth, test_filenames = (
@@ -940,10 +964,7 @@ def main(_):
 
     # Write out the trained graph and labels with the weights stored as
     # constants.
-    output_graph_def = graph_util.convert_variables_to_constants(
-        sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-    with gfile.FastGFile(FLAGS.output_graph, 'wb') as f:
-      f.write(output_graph_def.SerializeToString())
+    save_graph_to_file(sess, graph, FLAGS.output_graph)
     with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
       f.write('\n'.join(image_lists.keys()) + '\n')
 
@@ -962,6 +983,18 @@ if __name__ == '__main__':
       default='/tmp/output_graph.pb',
       help='Where to save the trained graph.'
   )
+  parser.add_argument(
+      '--intermediate_output_graphs_dir',
+      type=str,
+      default='/tmp/intermediate_graph/',
+      help='Where to save the intermediate graphs.'
+  )
+  parser.add_argument(
+      '--intermediate_store_frequency',
+      type=int,
+      default=0,
+      help='How many steps to store intermediate graph. If "0" then will not store.'
+  )
   parser.add_argument(
       '--output_labels',
       type=str,
diff --git a/tensorflow/examples/label_image/README.md b/tensorflow/examples/label_image/README.md
index 1103caf586..c5857f394a 100644
--- a/tensorflow/examples/label_image/README.md
+++ b/tensorflow/examples/label_image/README.md
@@ -1,4 +1,4 @@
-# TensorFlow C++ Image Recognition Demo
+# TensorFlow C++ and Python Image Recognition Demo
 
 This example shows how you can load a pre-trained TensorFlow network and use it
 to recognize objects in images in C++. For Java see the [Java
@@ -64,3 +64,24 @@ $ bazel-bin/tensorflow/examples/label_image/label_image --image=my_image.png
 
 For a more detailed look at this code, you can check out the C++ section of the
 [Inception tutorial](https://tensorflow.org/tutorials/image_recognition/).
+
+## Python implementation
+
+label_image.py is a python implementation that provides code corresponding
+to the C++ code here. This gives more intuitive mapping between C++ and
+Python than the Python code mentioned in the
+[Inception tutorial](https://tensorflow.org/tutorials/image_recognition/).
+and could be easier to add visualization or debug code.
+
+With tensorflow python package installed, you can run it like:
+```bash
+$ python3 tensorflow/examples/label_image/label_image.py
+```
+And get result similar to this:
+```
+military uniform 0.834305
+mortarboard 0.0218694
+academic gown 0.0103581
+pickelhaube 0.00800818
+bulletproof vest 0.0053509
+```
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
new file mode 100644
index 0000000000..39d0981337
--- /dev/null
+++ b/tensorflow/examples/label_image/label_image.py
@@ -0,0 +1,132 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+def load_graph(model_file):
+  graph = tf.Graph()
+  graph_def = tf.GraphDef()
+
+  with open(model_file, "rb") as f:
+    graph_def.ParseFromString(f.read())
+  with graph.as_default():
+    tf.import_graph_def(graph_def)
+
+  return graph
+
+def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
+				input_mean=0, input_std=255):
+  input_name = "file_reader"
+  output_name = "normalized"
+  file_reader = tf.read_file(file_name, input_name)
+  if file_name.endswith(".png"):
+    image_reader = tf.image.decode_png(file_reader, channels = 3,
+                                       name='png_reader')
+  elif file_name.endswith(".gif"):
+    image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
+                                                  name='gif_reader'))
+  elif file_name.endswith(".bmp"):
+    image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
+  else:
+    image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
+                                        name='jpeg_reader')
+  float_caster = tf.cast(image_reader, tf.float32)
+  dims_expander = tf.expand_dims(float_caster, 0);
+  resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
+  normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
+  sess = tf.Session()
+  result = sess.run(normalized)
+
+  return result
+
+def load_labels(label_file):
+  label = []
+  proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()
+  for l in proto_as_ascii_lines:
+    label.append(l.rstrip())
+  return label
+
+if __name__ == "__main__":
+  file_name = "tensorflow/examples/label_image/data/grace_hopper.jpg"
+  model_file = \
+    "tensorflow/examples/label_image/data/inception_v3_2016_08_28_frozen.pb"
+  label_file = "tensorflow/examples/label_image/data/imagenet_slim_labels.txt"
+  input_height = 299
+  input_width = 299
+  input_mean = 0
+  input_std = 255
+  input_layer = "input"
+  output_layer = "InceptionV3/Predictions/Reshape_1"
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--image", help="image to be processed")
+  parser.add_argument("--graph", help="graph/model to be executed")
+  parser.add_argument("--labels", help="name of file containing labels")
+  parser.add_argument("--input_height", type=int, help="input height")
+  parser.add_argument("--input_width", type=int, help="input width")
+  parser.add_argument("--input_mean", type=int, help="input mean")
+  parser.add_argument("--input_std", type=int, help="input std")
+  parser.add_argument("--input_layer", help="name of input layer")
+  parser.add_argument("--output_layer", help="name of output layer")
+  args = parser.parse_args()
+
+  if args.graph:
+    model_file = args.graph
+  if args.image:
+    file_name = args.image
+  if args.labels:
+    label_file = args.labels
+  if args.input_height:
+    input_height = args.input_height
+  if args.input_width:
+    input_width = args.input_width
+  if args.input_mean:
+    input_mean = args.input_mean
+  if args.input_std:
+    input_std = args.input_std
+  if args.input_layer:
+    input_layer = args.input_layer
+  if args.output_layer:
+    output_layer = args.output_layer
+
+  graph = load_graph(model_file)
+  t = read_tensor_from_image_file(file_name,
+                                  input_height=input_height,
+                                  input_width=input_width,
+                                  input_mean=input_mean,
+                                  input_std=input_std)
+
+  input_name = "import/" + input_layer
+  output_name = "import/" + output_layer
+  input_operation = graph.get_operation_by_name(input_name);
+  output_operation = graph.get_operation_by_name(output_name);
+
+  with tf.Session(graph=graph) as sess:
+    results = sess.run(output_operation.outputs[0],
+                      {input_operation.outputs[0]: t})
+  results = np.squeeze(results)
+
+  top_k = results.argsort()[-5:][::-1]
+  labels = load_labels(label_file)
+  for i in top_k:
+    print(labels[i], results[i])
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index 77b245ab15..4c5893384a 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -32,7 +32,7 @@ TFLEARN_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn
 
 
 function test() {
-  echo "Test "$1":"
+  echo "Test $1:"
   $TFLEARN_EXAMPLE_BASE_DIR/$1 $2
   if [ $? -eq 0 ]
   then
diff --git a/tensorflow/examples/tutorials/mnist/mnist.py b/tensorflow/examples/tutorials/mnist/mnist.py
index d533697976..3585043a2a 100644
--- a/tensorflow/examples/tutorials/mnist/mnist.py
+++ b/tensorflow/examples/tutorials/mnist/mnist.py
@@ -17,7 +17,7 @@
 
 Implements the inference/loss/training pattern for model building.
 
-1. inference() - Builds the model as far as is required for running the network
+1. inference() - Builds the model as far as required for running the network
 forward to make predictions.
 2. loss() - Adds to the inference model the layers required to generate loss.
 3. training() - Adds to the loss model the Ops required to generate and
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 13e5717b0d..aee482fda5 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -91,7 +91,6 @@ print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
 
 data_index = 0
 
-
 # Step 3: Function to generate a training batch for the skip-gram model.
 def generate_batch(batch_size, num_skips, skip_window):
   global data_index
@@ -101,9 +100,10 @@ def generate_batch(batch_size, num_skips, skip_window):
   labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
   span = 2 * skip_window + 1  # [ skip_window target skip_window ]
   buffer = collections.deque(maxlen=span)
-  for _ in range(span):
-    buffer.append(data[data_index])
-    data_index = (data_index + 1) % len(data)
+  if data_index + span > len(data):
+    data_index = 0
+  buffer.extend(data[data_index:data_index + span])
+  data_index += span
   for i in range(batch_size // num_skips):
     target = skip_window  # target label at the center of the buffer
     targets_to_avoid = [skip_window]
@@ -113,8 +113,12 @@ def generate_batch(batch_size, num_skips, skip_window):
       targets_to_avoid.append(target)
       batch[i * num_skips + j] = buffer[skip_window]
       labels[i * num_skips + j, 0] = buffer[target]
-    buffer.append(data[data_index])
-    data_index = (data_index + 1) % len(data)
+    if data_index == len(data):
+      buffer[:] = data[:span]
+      data_index = span
+    else:
+      buffer.append(data[data_index])
+      data_index += 1
   # Backtrack a little bit to avoid skipping words in the end of a batch
   data_index = (data_index + len(data) - span) % len(data)
   return batch, labels
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index a1b4255292..9c2fa60017 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -9,35 +9,6 @@ Construct and execute TensorFlow graphs in Go.
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
-1.  Download and extract the TensorFlow C library, preferably into `/usr/local`.
-    GPU-enabled versions require CUDA 8.0 and cuDNN 5.1. For other versions, the
-    TensorFlow C library will have to be built from source (see below).
-
-    -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.1.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.1.0.tar.gz)
-    -   OS X
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.1.0.tar.gz),
-
-    The following shell snippet downloads and extracts into `/usr/local`:
-
-    ```sh
-    TF_TYPE="cpu" # Set to "gpu" for GPU support
-    curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.1.0.tar.gz" |
-    sudo tar -C /usr/local -xz
-    ```
-
-2.  `go get` this package (and run tests):
-
-    ```sh
-    go get github.com/tensorflow/tensorflow/tensorflow/go
-    go test github.com/tensorflow/tensorflow/tensorflow/go
-    ```
-
-3.  Done!
-
-### Installing into locations other than `/usr/local`
 
 Refer to [Installing TensorFlow for Go](https://www.tensorflow.org/install/install_go)
 
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index d791e39c40..15ef3b9525 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -20,7 +20,7 @@ go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go
 
 cd $(dirname $0)
-for g in $(echo $GOPATH | sed "s/:/ /g"); do
+for g in $(echo "${GOPATH//:/ }"); do
     TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
     PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
     if [ -x "${PROTOC}" ]; then
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index af0eaede19..beede5a6d9 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -14288,6 +14288,36 @@ func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sinh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes hyperbolic tangent of `x` element-wise.
 func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
diff --git a/tensorflow/go/shape.go b/tensorflow/go/shape.go
index 114ab5decb..8d000cb9de 100644
--- a/tensorflow/go/shape.go
+++ b/tensorflow/go/shape.go
@@ -59,7 +59,7 @@ func (s Shape) NumDimensions() int {
 //
 // REQUIRES: 0 <= dim < s.NumDimensions()
 func (s Shape) Size(dim int) int64 {
-	if dim < 0 || dim > s.NumDimensions() {
+	if dim < 0 || dim >= s.NumDimensions() {
 		return -1
 	}
 	return s.dims[dim]
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 34e797a2b3..4a60c736b5 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -270,7 +270,7 @@ func typeOf(dt DataType, shape []int64) reflect.Type {
 	if ret == nil {
 		panic(bug("DataType %v is not supported", dt))
 	}
-	for _ = range shape {
+	for range shape {
 		ret = reflect.SliceOf(ret)
 	}
 	return ret
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 9abb63c966..90372660cd 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -9,7 +9,10 @@ load("build_defs", "JAVACOPTS")
 
 java_library(
     name = "tensorflow",
-    srcs = [":java_sources"],
+    srcs = [
+        ":java_op_sources",
+        ":java_sources",
+    ],
     data = [":libtensorflow_jni"],
     javacopts = JAVACOPTS,
     visibility = ["//visibility:public"],
@@ -26,6 +29,14 @@ filegroup(
     ],
 )
 
+filegroup(
+    name = "java_op_sources",
+    srcs = glob(["src/main/java/org/tensorflow/op/*.java"]),
+    visibility = [
+        "//tensorflow/java:__pkg__",
+    ],
+)
+
 java_library(
     name = "testutil",
     testonly = 1,
@@ -138,6 +149,19 @@ java_test(
     ],
 )
 
+java_test(
+    name = "ScopeTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/ScopeTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.ScopeTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflow_jni",
     srcs = select({
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index 750d76301e..5bd5b9a388 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -1,5 +1,14 @@
 # -*- Python -*-
 
+# Pin to Java 1.7 to ensure broader compatibility for the Java bindings on
+# Android. Note also that the android_library bazel rule currently enforces
+# java 7
+# https://github.com/bazelbuild/bazel/blob/6c1106b1a721516d3b3db54d2e1c31b44a76fbb1/src/main/java/com/google/devtools/build/lib/bazel/rules/android/BazelAndroidSemantics.java#L73
+
+JAVA_VERSION_OPTS = [
+    "-source 7 -target 7",
+]
+
 # A more robust set of lint and errorprone checks when building
 # Java source to improve code consistency.
 
@@ -151,4 +160,4 @@ EP_DISABLED_CHECKS = [
 
 EP_OPTS = EP_ENABLED_WARNINGS + EP_DISABLED_CHECKS
 
-JAVACOPTS = XLINT_OPTS + EP_OPTS
+JAVACOPTS = JAVA_VERSION_OPTS + XLINT_OPTS + EP_OPTS
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 1fc9d49a86..462e38d332 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -19,8 +19,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.8</source>
-          <target>1.8</target>
+          <source>1.7</source>
+          <target>1.7</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index d1d0359b1d..f2974220d9 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -16,7 +16,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>3.2.0</version>
+      <version>3.3.1</version>
     </dependency>
   </dependencies>
 
@@ -27,8 +27,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.8</source>
-          <target>1.8</target>
+          <source>1.7</source>
+          <target>1.7</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh
index 0e14bc4964..0bb4efbcc0 100755
--- a/tensorflow/java/maven/release.sh
+++ b/tensorflow/java/maven/release.sh
@@ -21,7 +21,7 @@ TF_VERSION="$1"
 SETTINGS_XML="$2"
 shift
 shift
-CMD="$@"
+CMD="$*"
 
 if [[ -z "${TF_VERSION}" ]]
 then
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index b5e2bfc3a6..302ed96c12 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -23,7 +23,7 @@ IS_SNAPSHOT="false"
 if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then
   IS_SNAPSHOT="true"
 fi
-PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.2.0/protoc-3.2.0-linux-x86_64.zip"
+PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip"
 
 set -ex
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
new file mode 100644
index 0000000000..dff3a45463
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/Input.java
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+/**
+ * Interface implemented by operands of a TensorFlow operation.
+ *
+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * // The "decodeJpeg" operation can be used as input to the "cast" operation
+ * Input decodeJpeg = ops.image().decodeJpeg(...);
+ * ops.math().cast(decodeJpeg, DataType.FLOAT);
+ *
+ * // The output "y" of the "unique" operation can be used as input to the "cast" operation
+ * Output y = ops.array().unique(...).y();
+ * ops.math().cast(y, DataType.FLOAT);
+ *
+ * // The "split" operation can be used as input list to the "concat" operation
+ * Iterable<? extends Input> split = ops.array().split(...);
+ * ops.array().concat(0, split);
+ * }</pre>
+ */
+public interface Input {
+
+  /**
+   * Returns the symbolic handle of a tensor.
+   *
+   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
+   * used to obtain a symbolic handle that represents the computation of the input.
+   *
+   * @see {@link OperationBuilder#addInput(Output)}.
+   */
+  Output asOutput();
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
index 43dbaf125c..200350f7ae 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
@@ -73,15 +73,14 @@ public final class Operation {
   /**
    * Returns the size of the list of Tensors produced by this operation.
    *
-   * <p>An Operation has multiple named outputs, each of which produces either
-   * a single tensor or a list of tensors. This method returns the size of
-   * the list of tensors for a specific named output of the operation.
+   * <p>An Operation has multiple named outputs, each of which produces either a single tensor or a
+   * list of tensors. This method returns the size of the list of tensors for a specific named
+   * output of the operation.
    *
-   * @param name identifier of the list of tensors (of which there may
-   *        be many) produced by this operation.
-   * @returns the size of the list of Tensors produced by this named output.
-   * @throws IllegalArgumentException if this operation has no output
-   *         with the provided name.
+   * @param name identifier of the list of tensors (of which there may be many) produced by this
+   *     operation.
+   * @return the size of the list of Tensors produced by this named output.
+   * @throws IllegalArgumentException if this operation has no output with the provided name.
    */
   public int outputListLength(final String name) {
     Graph.Reference r = graph.ref();
@@ -97,6 +96,61 @@ public final class Operation {
     return new Output(this, idx);
   }
 
+  @Override
+  public int hashCode() {
+    return Long.valueOf(unsafeNativeHandle).hashCode();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    }
+    if (!(o instanceof Operation)) {
+      return false;
+    }
+    Operation that = (Operation) o;
+    if (graph != that.graph) {
+      return false;
+    }
+
+    // The graph object is known to be identical here, so this one
+    // reference is sufficient to validate the use of native pointers
+    // in both objects.
+    Graph.Reference r = graph.ref();
+    try {
+      return unsafeNativeHandle == that.unsafeNativeHandle;
+    } finally {
+      r.close();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return String.format("<%s '%s'>", type(), name());
+  }
+
+  /**
+   * Returns the size of the given inputs list of Tensors for this operation.
+   *
+   * <p>An Operation has multiple named inputs, each of which contains either a single tensor or a
+   * list of tensors. This method returns the size of the list of tensors for a specific named input
+   * of the operation.
+   *
+   * @param name identifier of the list of tensors (of which there may be many) inputs to this
+   *     operation.
+   * @returns the size of the list of Tensors produced by this named input.
+   * @throws IllegalArgumentException if this operation has no input with the provided name.
+   */
+  public int inputListLength(final String name) {
+    Graph.Reference r = graph.ref();
+    try {
+      return inputListLength(unsafeNativeHandle, name);
+    } finally {
+      r.close();
+    }
+  }
+
   long getUnsafeNativeHandle() {
     return unsafeNativeHandle;
   }
@@ -122,6 +176,7 @@ public final class Operation {
   }
 
   private final long unsafeNativeHandle;
+
   private final Graph graph;
 
   private static native String name(long handle);
@@ -132,6 +187,8 @@ public final class Operation {
 
   private static native int outputListLength(long handle, String name);
 
+  private static native int inputListLength(long handle, String name);
+
   private static native long[] shape(long graphHandle, long opHandle, int output);
 
   private static native int dtype(long graphHandle, long opHandle, int output);
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index 38ffa2a8e1..8f7559d39e 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -28,7 +28,7 @@ import java.nio.charset.Charset;
  * <pre>{@code
  * // g is a Graph instance.
  * try (Tensor c1 = Tensor.create(3.0f)) {
- *   g.opBuilder("Constant", "MyConst")
+ *   g.opBuilder("Const", "MyConst")
  *       .setAttr("dtype", c1.dataType())
  *       .setAttr("value", c1)
  *       .build();
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Output.java b/tensorflow/java/src/main/java/org/tensorflow/Output.java
index ab128c2b30..ad28eb2743 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Output.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Output.java
@@ -15,13 +15,17 @@ limitations under the License.
 
 package org.tensorflow;
 
+import java.util.Objects;
 /**
  * A symbolic handle to a tensor produced by an {@link Operation}.
  *
  * <p>An Output is a symbolic handle to a tensor. The value of the Tensor is computed by executing
  * the {@link Operation} in a {@link Session}.
+ *
+ * <p>By implementing the {@link Input} interface, instances of this class could also be passed
+ * directly in input to an operation.
  */
-public final class Output {
+public final class Output implements Input {
 
   /** Handle to the idx-th output of the Operation {@code op}. */
   public Output(Operation op, int idx) {
@@ -49,6 +53,35 @@ public final class Output {
     return operation.dtype(index);
   }
 
+  @Override
+  public Output asOutput() {
+    return this;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(operation, index);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    }
+    if (o instanceof Output) {
+      Output that = (Output) o;
+      return index == that.index && operation.equals(that.operation);
+    }
+    return false;
+  }
+
+  @Override
+  public String toString() {
+    return String.format(
+        "<%s '%s:%d' shape=%s dtype=%s>",
+        operation.type(), operation.name(), index, shape().toString(), dataType());
+  }
+
   private final Operation operation;
   private final int index;
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index 0d071e1674..f73cded4e3 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -125,7 +125,7 @@ public final class Session implements AutoCloseable {
      *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
      *     feed(operation_name, output_index)}. These colon-separated names are commonly used in the
      *     {@code SignatureDef} protocol buffer messages that are included in {@link
-     *     SavedModelBundle.metaGraphDef()}.
+     *     SavedModelBundle#metaGraphDef()}.
      */
     public Runner feed(String operation, Tensor t) {
       return feed(parseOutput(operation), t);
@@ -165,7 +165,7 @@ public final class Session implements AutoCloseable {
      *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
      *     fetch(operation_name, output_index)}. These colon-separated names are commonly used in
      *     the {@code SignatureDef} protocol buffer messages that are included in {@link
-     *     SavedModelBundle.metaGraphDef()}.
+     *     SavedModelBundle#metaGraphDef()}.
      */
     public Runner fetch(String operation) {
       return fetch(parseOutput(operation));
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java b/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
new file mode 100644
index 0000000000..2e84cac1ac
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java
@@ -0,0 +1,146 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * A class to manage scoped (hierarchical) names for operators.
+ *
+ * <p>{@code NameScope} manages hierarchical names where each component in the hierarchy is
+ * separated by a forward slash {@code '/'}. For instance, {@code nn/Const_72} or {@code
+ * nn/gradient/assign/init}. Each scope is a subtree in this hierarchy.
+ *
+ * <p>Use {@code NameScope} to group related operations within a hierarchy, which for example lets
+ * tensorboard coalesce nodes for better graph visualizations.
+ *
+ * <p>This class is package private, user code creates {@link Scope} which internally delegates
+ * calls to an underlying {@code NameScope}.
+ *
+ * <p>This class is <b>not</b> thread-safe.
+ */
+final class NameScope {
+
+  NameScope withSubScope(String scopeName) {
+    checkPattern(NAME_REGEX, scopeName);
+    // Override with opName if it exists.
+    String actualName = (opName != null) ? opName : scopeName;
+    String newPrefix = fullyQualify(makeUnique(actualName));
+    return new NameScope(newPrefix, null, null);
+  }
+
+  NameScope withName(String name) {
+    checkPattern(NAME_REGEX, name);
+    // All context except for the opName is shared with the new scope.
+    return new NameScope(opPrefix, name, ids);
+  }
+
+  String makeOpName(String name) {
+    checkPattern(NAME_REGEX, name);
+    // Override with opName if it exists.
+    String actualName = (opName != null) ? opName : name;
+    return fullyQualify(makeUnique(actualName));
+  }
+
+  /**
+   * Create a new, root-level namescope.
+   *
+   * <p>A root-level namescope generates operator names with no components, like {@code Const_72}
+   * and {@code result}.
+   */
+  NameScope() {
+    this(null, null, null);
+  }
+
+  private NameScope(String opPrefix, String opName, Map<String, Integer> ids) {
+    this.opPrefix = opPrefix;
+    this.opName = opName;
+    if (ids != null) {
+      this.ids = ids;
+    } else {
+      this.ids = new HashMap<String, Integer>();
+    }
+  }
+
+  // Generate a unique name, different from existing ids.
+  //
+  // ids is a map from id to integer, representing a counter of the
+  // number of previous requests to generate a unique name for the
+  // given id.
+  //
+  // For instance, the first use of makeUnique("a") adds "a" -> 1
+  // to ids and returns "a".
+  //
+  // The second use of makeUnique("a") updates ids to "a" -> 2
+  // and returns "a_1", and so on.
+  private String makeUnique(String id) {
+    if (!ids.containsKey(id)) {
+      ids.put(id, 1);
+      return id;
+    } else {
+      int cur = ids.get(id);
+      ids.put(id, cur + 1);
+      return String.format("%s_%d", id, cur);
+    }
+  }
+
+  private String fullyQualify(String name) {
+    if (opPrefix != null) {
+      return String.format("%s/%s", opPrefix, name);
+    } else {
+      return name;
+    }
+  }
+
+  // If opPrefix is non-null, it is a prefix applied to all names
+  // created by this instance.
+  private final String opPrefix;
+
+  // If opName is non-null, it is used to derive the unique name
+  // for operators rather than the provided default name.
+  private final String opName;
+
+  // NameScope generates unique names by appending a numeric suffix if
+  // needed. This is a map containing names already created by this
+  // instance mapped to the next available numeric suffix for it.
+  private final Map<String, Integer> ids;
+
+  private static void checkPattern(Pattern pattern, String name) {
+    if (name == null) {
+      throw new IllegalArgumentException("Names cannot be null");
+    }
+    if (!pattern.matcher(name).matches()) {
+      throw new IllegalArgumentException(
+          String.format(
+              "invalid name: '%s' does not match the regular expression %s",
+              name, NAME_REGEX.pattern()));
+    }
+  }
+
+  // The constraints for operator and scope names originate from restrictions on node names
+  // noted in the proto definition core/framework/node_def.proto for NodeDef and actually
+  // implemented in core/framework/node_def_util.cc [Note that the proto comment does not include
+  // dash (-) in names, while the actual implementation permits it. This regex follows the actual
+  // implementation.]
+  //
+  // This pattern is used to ensure fully qualified names always start with a LETTER_DIGIT_DOT,
+  // followed by zero or more LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE. SLASH is not permitted in
+  // actual user-supplied names to NameScope - it is used as a reserved character to separate
+  // subcomponents within fully qualified names.
+  private static final Pattern NAME_REGEX = Pattern.compile("[A-Za-z0-9.][A-Za-z0-9_.\\-]*");
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java b/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
new file mode 100644
index 0000000000..8de2eaeb79
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/Scope.java
@@ -0,0 +1,165 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import org.tensorflow.Graph;
+
+/**
+ * Manages groups of related properties when creating Tensorflow Operations, such as a common name
+ * prefix.
+ *
+ * <p>A {@code Scope} is a container for common properties applied to TensorFlow Ops. Normal user
+ * code initializes a {@code Scope} and provides it to Operation building classes. For example:
+ *
+ * <pre>{@code
+ * Scope scope = new Scope(graph);
+ * Constant c = Constant.create(scope, 42);
+ * }</pre>
+ *
+ * <p>An Operation building class acquires a Scope, and uses it to set properties on the underlying
+ * Tensorflow ops. For example:
+ *
+ * <pre>{@code
+ * // An operator class that adds a constant.
+ * public class Constant {
+ *   public static Constant create(Scope scope, ...) {
+ *      scope.graph().opBuilder(
+ *        "Const", scope.makeOpName("Const"))
+ *        .setAttr(...)
+ *        .build()
+ *      ...
+ *   }
+ * }
+ * }</pre>
+ *
+ * <p><b>Scope hierarchy:</b>
+ *
+ * <p>A {@code Scope} provides various {@code with()} methods that create a new scope. The new scope
+ * typically has one property changed while other properties are inherited from the parent scope.
+ *
+ * <p>An example using {@code Constant} implemented as before:
+ *
+ * <pre>{@code
+ * Scope root = new Scope(graph);
+ *
+ * // The linear subscope will generate names like linear/...
+ * Scope linear = Scope.withSubScope("linear");
+ *
+ * // This op name will be "linear/W"
+ * Constant.create(linear.withName("W"), ...);
+ *
+ * // This op will be "linear/Const", using the default
+ * // name provided by Constant
+ * Constant.create(linear, ...);
+ *
+ * // This op will be "linear/Const_1", using the default
+ * // name provided by Constant and making it unique within
+ * // this scope
+ * Constant.create(linear, ...);
+ * }</pre>
+ *
+ * <p>Scope objects are <b>not</b> thread-safe.
+ */
+public final class Scope {
+
+  /**
+   * Create a new top-level scope.
+   *
+   * @param graph The graph instance to be managed by the scope.
+   */
+  public Scope(Graph graph) {
+    this(graph, new NameScope());
+  }
+
+  /** Returns the graph managed by this scope. */
+  public Graph graph() {
+    return graph;
+  }
+
+  /**
+   * Returns a new scope where added operations will have the provided name prefix.
+   *
+   * <p>Ops created with this scope will have {@code name/childScopeName/} as the prefix. The actual
+   * name will be unique in the returned scope. All other properties are inherited from the current
+   * scope.
+   *
+   * <p>The child scope name must match the regular expression {@code [A-Za-z0-9.][A-Za-z0-9_.\-]*}
+   *
+   * @param childScopeName name for the new child scope
+   * @return a new subscope
+   * @throws IllegalArgumentException if the name is invalid
+   */
+  public Scope withSubScope(String childScopeName) {
+    return new Scope(graph, nameScope.withSubScope(childScopeName));
+  }
+
+  /**
+   * Return a new scope that uses the provided name for an op.
+   *
+   * <p>Operations created within this scope will have a name of the form {@code
+   * name/opName[_suffix]}. This lets you name a specific operator more meaningfully.
+   *
+   * <p>Names must match the regular expression {@code [A-Za-z0-9.][A-Za-z0-9_.\-]*}
+   *
+   * @param opName name for an operator in the returned scope
+   * @return a new Scope that uses opName for operations.
+   * @throws IllegalArgumentException if the name is invalid
+   */
+  public Scope withName(String opName) {
+    return new Scope(graph, nameScope.withName(opName));
+  }
+
+  /**
+   * Create a unique name for an operator, using a provided default if necessary.
+   *
+   * <p>This is normally called only by operator building classes.
+   *
+   * <p>This method generates a unique name, appropriate for the name scope controlled by this
+   * instance. Typical operator building code might look like
+   *
+   * <pre>{@code
+   * scope.graph().opBuilder("Const", scope.makeOpName("Const"))...
+   * }</pre>
+   *
+   * <p><b>Note:</b> if you provide a composite operator building class (i.e, a class that adds a
+   * set of related operations to the graph by calling other operator building code) you should also
+   * create a {@link #withSubScope(String)} scope for the underlying operators to group them under a
+   * meaningful name.
+   *
+   * <pre>{@code
+   * public static Stddev create(Scope scope, ...) {
+   *   // group sub-operations under a common name
+   *   Scope group = scope.withSubScope("stddev");
+   *   ... Sqrt.create(group, Mean.create(group, ...))
+   * }
+   * }</pre>
+   *
+   * @param defaultName name for the underlying operator.
+   * @return unique name for the operator.
+   * @throws IllegalArgumentException if the default name is invalid.
+   */
+  public String makeOpName(String defaultName) {
+    return nameScope.makeOpName(defaultName);
+  }
+
+  private Scope(Graph graph, NameScope nameScope) {
+    this.graph = graph;
+    this.nameScope = nameScope;
+  }
+
+  private final Graph graph;
+  private final NameScope nameScope;
+}
diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc
index b3d5fc4ec3..ccc44d91c0 100644
--- a/tensorflow/java/src/main/native/operation_jni.cc
+++ b/tensorflow/java/src/main/native/operation_jni.cc
@@ -156,3 +156,21 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv* env,
 
   return static_cast<jint>(TF_OperationOutputType(TF_Output{op, output_index}));
 }
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv* env,
+                                                                      jclass clazz,
+                                                                      jlong handle,
+                                                                      jstring name) {
+  TF_Operation* op = requireHandle(env, handle);
+  if (op == nullptr) return 0;
+
+  TF_Status* status = TF_NewStatus();
+
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  int result = TF_OperationInputListLength(op, cname, status);
+  env->ReleaseStringUTFChars(name, cname);
+
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return result;
+}
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index b5d156f7c2..6f379256d2 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -73,6 +73,17 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Operation_shape(JNIEnv *,
 JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv *, jclass,
                                                            jlong, jlong, jint);
 
+
+/*
+ * Class:     org_tensorflow_Operation
+ * Method:    inputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv *,
+                                                                      jclass,
+                                                                      jlong,
+                                                                      jstring);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
index 74fdcf484e..27afc046ac 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
@@ -16,8 +16,14 @@ limitations under the License.
 package org.tensorflow;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -45,6 +51,91 @@ public class OperationTest {
     }
   }
 
+  @Test
+  public void operationEquality() {
+    Operation op1;
+    try (Graph g = new Graph()) {
+      op1 = TestUtil.constant(g, "op1", 1).op();
+      Operation op2 = TestUtil.constant(g, "op2", 2).op();
+      Operation op3 = new Operation(g, op1.getUnsafeNativeHandle());
+      Operation op4 = g.operation("op1");
+      assertEquals(op1, op1);
+      assertNotEquals(op1, op2);
+      assertEquals(op1, op3);
+      assertEquals(op1.hashCode(), op3.hashCode());
+      assertEquals(op1, op4);
+      assertEquals(op1.hashCode(), op4.hashCode());
+      assertEquals(op3, op4);
+      assertNotEquals(op2, op3);
+      assertNotEquals(op2, op4);
+    }
+    try (Graph g = new Graph()) {
+      Operation newOp1 = TestUtil.constant(g, "op1", 1).op();
+      assertNotEquals(op1, newOp1);
+    }
+  }
+
+  @Test
+  public void operationCollection() {
+    try (Graph g = new Graph()) {
+      Operation op1 = TestUtil.constant(g, "op1", 1).op();
+      Operation op2 = TestUtil.constant(g, "op2", 2).op();
+      Operation op3 = new Operation(g, op1.getUnsafeNativeHandle());
+      Operation op4 = g.operation("op1");
+      Set<Operation> ops = new HashSet<>();
+      ops.addAll(Arrays.asList(op1, op2, op3, op4));
+      assertEquals(2, ops.size());
+      assertTrue(ops.contains(op1));
+      assertTrue(ops.contains(op2));
+      assertTrue(ops.contains(op3));
+      assertTrue(ops.contains(op4));
+    }
+  }
+
+  @Test
+  public void operationToString() {
+    try (Graph g = new Graph()) {
+      Operation op = TestUtil.constant(g, "c", new int[] {1}).op();
+      assertNotNull(op.toString());
+    }
+  }
+
+  @Test
+  public void outputEquality() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", 1);
+      Output output1 = output.op().output(0);
+      Output output2 = g.operation("c").output(0);
+      assertEquals(output, output1);
+      assertEquals(output.hashCode(), output1.hashCode());
+      assertEquals(output, output2);
+      assertEquals(output.hashCode(), output2.hashCode());
+    }
+  }
+
+  @Test
+  public void outputCollection() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", 1);
+      Output output1 = output.op().output(0);
+      Output output2 = g.operation("c").output(0);
+      Set<Output> ops = new HashSet<>();
+      ops.addAll(Arrays.asList(output, output1, output2));
+      assertEquals(1, ops.size());
+      assertTrue(ops.contains(output));
+      assertTrue(ops.contains(output1));
+      assertTrue(ops.contains(output2));
+    }
+  }
+
+  @Test
+  public void outputToString() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", new int[] {1});
+      assertNotNull(output.toString());
+    }
+  }
+
   @Test
   public void outputListLength() {
     assertEquals(1, split(new int[] {0, 1}, 1));
@@ -52,6 +143,16 @@ public class OperationTest {
     assertEquals(3, split(new int[] {0, 1, 2}, 3));
   }
 
+  @Test
+  public void inputListLength() {
+    assertEquals(1, splitWithInputList(new int[] {0, 1}, 1, "split_dim"));
+    try {
+      splitWithInputList(new int[] {0, 1}, 2, "inputs");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
+  }
+
   private static int split(int[] values, int num_split) {
     try (Graph g = new Graph()) {
       return g.opBuilder("Split", "Split")
@@ -62,4 +163,15 @@ public class OperationTest {
           .outputListLength("output");
     }
   }
+
+  private static int splitWithInputList(int[] values, int num_split, String name) {
+    try (Graph g = new Graph()) {
+      return g.opBuilder("Split", "Split")
+          .addInput(TestUtil.constant(g, "split_dim", 0))
+          .addInput(TestUtil.constant(g, "values", values))
+          .setAttr("num_split", num_split)
+          .build()
+          .inputListLength(name);
+    }
+  }
 }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index 0d2dbc5b88..50bdf351e3 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -109,7 +109,7 @@ public class SessionTest {
         assertEquals(1, outputs.size());
         final int[][] expected = {{31}};
         assertArrayEquals(expected, outputs.get(0).copyTo(new int[1][1]));
-        // Sanity check on metadatar
+        // Sanity check on metadata
         // See comments in fullTraceRunOptions() for an explanation about
         // why this check is really silly. Ideally, this would be:
         /*
@@ -187,7 +187,7 @@ public class SessionTest {
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251515362
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251692558
     //
-    // For this test, for now, the use of specific bytes sufficies.
+    // For this test, for now, the use of specific bytes suffices.
     return new byte[] {0x08, 0x03};
     /*
     return org.tensorflow.framework.RunOptions.newBuilder()
@@ -207,7 +207,7 @@ public class SessionTest {
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251515362
     // https://github.com/bazelbuild/rules_go/pull/121#issuecomment-251692558
     //
-    // For this test, for now, the use of specific bytes sufficies.
+    // For this test, for now, the use of specific bytes suffices.
     return new byte[] {0x10, 0x01, 0x28, 0x01};
     /*
     return org.tensorflow.framework.ConfigProto.newBuilder()
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 44eecc1d1e..3ff59e71b2 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -472,7 +472,7 @@ public class TensorTest {
   @Test
   public void fromHandle() {
     // fromHandle is a package-visible method intended for use when the C TF_Tensor object has been
-    // created indepdently of the Java code. In practice, two Tensor instances MUST NOT have the
+    // created independently of the Java code. In practice, two Tensor instances MUST NOT have the
     // same native handle.
     //
     // An exception is made for this test, where the pitfalls of this is avoided by not calling
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java
new file mode 100644
index 0000000000..9256cb281d
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java
@@ -0,0 +1,270 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Output;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+
+/** Unit tests for {@link org.tensorflow.Scope}. */
+@RunWith(JUnit4.class)
+public class ScopeTest {
+
+  @Test
+  public void basicNames() {
+    try (Graph g = new Graph()) {
+      Scope root = new Scope(g);
+      assertEquals("add", root.makeOpName("add"));
+      assertEquals("add_1", root.makeOpName("add"));
+      assertEquals("add_2", root.makeOpName("add"));
+      assertEquals("mul", root.makeOpName("mul"));
+    }
+  }
+
+  @Test
+  public void hierarchicalNames() {
+    try (Graph g = new Graph()) {
+      Scope root = new Scope(g);
+      Scope child = root.withSubScope("child");
+      assertEquals("child/add", child.makeOpName("add"));
+      assertEquals("child/add_1", child.makeOpName("add"));
+      assertEquals("child/mul", child.makeOpName("mul"));
+
+      Scope child_1 = root.withSubScope("child");
+      assertEquals("child_1/add", child_1.makeOpName("add"));
+      assertEquals("child_1/add_1", child_1.makeOpName("add"));
+      assertEquals("child_1/mul", child_1.makeOpName("mul"));
+
+      Scope c_c = root.withSubScope("c").withSubScope("c");
+      assertEquals("c/c/add", c_c.makeOpName("add"));
+
+      Scope c_1 = root.withSubScope("c");
+      Scope c_1_c = c_1.withSubScope("c");
+      assertEquals("c_1/c/add", c_1_c.makeOpName("add"));
+
+      Scope c_1_c_1 = c_1.withSubScope("c");
+      assertEquals("c_1/c_1/add", c_1_c_1.makeOpName("add"));
+    }
+  }
+
+  @Test
+  public void scopeAndOpNames() {
+    try (Graph g = new Graph()) {
+      Scope root = new Scope(g);
+
+      Scope child = root.withSubScope("child");
+
+      assertEquals("child/add", child.makeOpName("add"));
+      assertEquals("child_1", root.makeOpName("child"));
+      assertEquals("child_2/p", root.withSubScope("child").makeOpName("p"));
+    }
+  }
+
+  @Test
+  public void validateNames() {
+    try (Graph g = new Graph()) {
+      Scope root = new Scope(g);
+
+      final String[] invalid_names = {
+        "_", "-", "-x", // Names are constrained to start with [A-Za-z0-9.]
+        null, "", "a$", // Invalid characters
+        "a/b", // slashes not allowed
+      };
+
+      for (String name : invalid_names) {
+        try {
+          root.withName(name);
+          fail("failed to catch invalid op name.");
+        } catch (IllegalArgumentException ex) {
+          // expected
+        }
+        // Subscopes follow the same rules
+        try {
+          root.withSubScope(name);
+          fail("failed to catch invalid scope name: " + name);
+        } catch (IllegalArgumentException ex) {
+          // expected
+        }
+      }
+
+      // Unusual but valid names.
+      final String[] valid_names = {".", "..", "._-.", "a--."};
+
+      for (String name : valid_names) {
+        root.withName(name);
+        root.withSubScope(name);
+      }
+    }
+  }
+
+  @Test
+  public void basic() {
+    try (Graph g = new Graph()) {
+      Scope s = new Scope(g);
+      Const c1 = Const.create(s, 42);
+      assertEquals("Const", c1.output().op().name());
+      Const c2 = Const.create(s, 7);
+      assertEquals("Const_1", c2.output().op().name());
+      Const c3 = Const.create(s.withName("four"), 4);
+      assertEquals("four", c3.output().op().name());
+      Const c4 = Const.create(s.withName("four"), 4);
+      assertEquals("four_1", c4.output().op().name());
+    }
+  }
+
+  @Test
+  public void hierarchy() {
+    try (Graph g = new Graph()) {
+      Scope root = new Scope(g);
+      Scope child = root.withSubScope("child");
+      assertEquals("child/Const", Const.create(child, 42).output().op().name());
+      assertEquals("child/four", Const.create(child.withName("four"), 4).output().op().name());
+    }
+  }
+
+  @Test
+  public void composite() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope s = new Scope(g);
+      Output data = Const.create(s.withName("data"), new int[] {600, 470, 170, 430, 300}).output();
+
+      // Create a composite op with a customized name
+      Variance var1 = Variance.create(s.withName("example"), data);
+      assertEquals("example/variance", var1.output().op().name());
+
+      // Confirm internally added ops have the right names.
+      assertNotNull(g.operation("example/squared_deviation"));
+      assertNotNull(g.operation("example/Mean"));
+      assertNotNull(g.operation("example/zero"));
+
+      // Same composite op with a default name
+      Variance var2 = Variance.create(s, data);
+      assertEquals("variance/variance", var2.output().op().name());
+
+      // Confirm internally added ops have the right names.
+      assertNotNull(g.operation("variance/squared_deviation"));
+      assertNotNull(g.operation("variance/Mean"));
+      assertNotNull(g.operation("variance/zero"));
+
+      // Verify correct results as well.
+      Tensor result = sess.runner().fetch(var1.output()).run().get(0);
+      assertEquals(21704, result.intValue());
+      result = sess.runner().fetch(var2.output()).run().get(0);
+      assertEquals(21704, result.intValue());
+    }
+  }
+
+  // "handwritten" sample operator classes
+  private static final class Const {
+    private final Output output;
+
+    static Const create(Scope s, Object v) {
+      try (Tensor value = Tensor.create(v)) {
+        return new Const(
+            s.graph()
+                .opBuilder("Const", s.makeOpName("Const"))
+                .setAttr("dtype", value.dataType())
+                .setAttr("value", value)
+                .build()
+                .output(0));
+      }
+    }
+
+    Const(Output o) {
+      output = o;
+    }
+
+    Output output() {
+      return output;
+    }
+  }
+
+  private static final class Mean {
+    private final Output output;
+
+    static Mean create(Scope s, Output input, Output reductionIndices) {
+      return new Mean(
+          s.graph()
+              .opBuilder("Mean", s.makeOpName("Mean"))
+              .addInput(input)
+              .addInput(reductionIndices)
+              .build()
+              .output(0));
+    }
+
+    Mean(Output o) {
+      output = o;
+    }
+
+    Output output() {
+      return output;
+    }
+  }
+
+  private static final class SquaredDifference {
+    private final Output output;
+
+    static SquaredDifference create(Scope s, Output x, Output y) {
+      return new SquaredDifference(
+          s.graph()
+              .opBuilder("SquaredDifference", s.makeOpName("SquaredDifference"))
+              .addInput(x)
+              .addInput(y)
+              .build()
+              .output(0));
+    }
+
+    SquaredDifference(Output o) {
+      output = o;
+    }
+
+    Output output() {
+      return output;
+    }
+  }
+
+  private static final class Variance {
+    private final Output output;
+
+    static Variance create(Scope base, Output x) {
+      Scope s = base.withSubScope("variance");
+      Output zero = Const.create(s.withName("zero"), new int[] {0}).output();
+      Output sqdiff =
+          SquaredDifference.create(
+                  s.withName("squared_deviation"), x, Mean.create(s, x, zero).output())
+              .output();
+
+      return new Variance(Mean.create(s.withName("variance"), sqdiff, zero).output());
+    }
+
+    Variance(Output o) {
+      output = o;
+    }
+
+    Output output() {
+      return output;
+    }
+  }
+}
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 7530c09cfe..7f291b0b13 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -832,6 +832,7 @@ py_test(
     data = ["//tensorflow/python:meta_graph_testdata"],
     main = "framework/meta_graph_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -3055,6 +3056,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
+        "no_pip_gpu",
     ],
     deps = [
         ":array_ops",
@@ -3079,6 +3081,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
+        "no_pip_gpu",
     ],
     deps = [
         ":client",
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index da27f4cebe..e863c2ddb9 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -1312,7 +1312,7 @@ class DebugAnalyzer(object):
     all_inputs = copy.copy(tracker(node_name, is_control=False))
     is_ctrl = [False] * len(all_inputs)
     if include_control:
-      # Sort control inputs or recipients in in alphabetical order of the node
+      # Sort control inputs or recipients in alphabetical order of the node
       # names.
       ctrl_inputs = sorted(tracker(node_name, is_control=True))
       all_inputs.extend(ctrl_inputs)
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 536365b692..2b2289d6a8 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -215,7 +215,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
       wrapper_sess.partial_run_setup(self._p)
 
   def testInteractiveSessionInit(self):
-    """The wrapper should work also on other subclassses of session.Session."""
+    """The wrapper should work also on other subclasses of session.Session."""
 
     TestDebugWrapperSession(
         session.InteractiveSession(), self._dump_root, self._observer)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 8785ddc5a6..03bcf7ae57 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -395,7 +395,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
 
 
 class DNNLinearCombinedRegressor(estimator.Estimator):
-  """An estimator for TensorFlow Linear and DNN joined models for regresssion.
+  """An estimator for TensorFlow Linear and DNN joined models for regression.
 
   Note: This estimator is also known as wide-n-deep.
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index dd2fd7b2ab..f49a682dbe 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -382,16 +382,17 @@ class TensorFlowTestCase(googletest.TestCase):
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
     Example:
-
-      class MyOperatorTest(test_util.TensorFlowTestCase):
-        def testMyOperator(self):
-          with self.test_session(use_gpu=True):
-            valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
-            result = MyOperator(valid_input).eval()
-            self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
-            invalid_input = [-1.0, 2.0, 7.0]
-            with self.assertRaisesOpError("negative input not supported"):
-              MyOperator(invalid_input).eval()
+    ```python
+    class MyOperatorTest(test_util.TensorFlowTestCase):
+      def testMyOperator(self):
+        with self.test_session(use_gpu=True):
+          valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
+          result = MyOperator(valid_input).eval()
+          self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
+          invalid_input = [-1.0, 2.0, 7.0]
+          with self.assertRaisesOpError("negative input not supported"):
+            MyOperator(invalid_input).eval()
+    ```
 
     Args:
       graph: Optional graph to use during the returned session.
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 257677ce49..d8e069105f 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2382,11 +2382,6 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
-    tags = [
-        "manual",  # http://b/62429636
-        "noguitar",
-        "notap",
-    ],
 )
 
 cuda_py_test(
@@ -2401,11 +2396,6 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
-    tags = [
-        "manual",  # http://b/62429636
-        "noguitar",
-        "notap",
-    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 013aa1ba8a..b5b17ff80a 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -113,6 +113,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     self._compare(data, np.arctan, math_ops.atan, use_gpu)
     self._compare(data, np.ceil, math_ops.ceil, use_gpu)
     self._compare(data, np.cos, math_ops.cos, use_gpu)
+    self._compare(data, np.cosh, math_ops.cosh, use_gpu)
     self._compare(data, np.exp, math_ops.exp, use_gpu)
     self._compare(data, np.floor, math_ops.floor, use_gpu)
     self._compare(data, np.log, math_ops.log, use_gpu)
@@ -120,6 +121,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     self._compare(data, np.negative, math_ops.negative, use_gpu)
     self._compare(data, self._rsqrt, math_ops.rsqrt, use_gpu)
     self._compare(data, np.sin, math_ops.sin, use_gpu)
+    self._compare(data, np.sinh, math_ops.sinh, use_gpu)
     self._compare(data, np.sqrt, math_ops.sqrt, use_gpu)
     self._compare(data, np.square, math_ops.square, use_gpu)
     self._compare(data, np.tan, math_ops.tan, use_gpu)
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 0846470abc..b47139e6b8 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -200,6 +200,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(z, np.log, math_ops.log)
     self._compareBoth(z, np.log1p, math_ops.log1p)
+    self._compareBoth(x, np.sinh, math_ops.sinh)
+    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(x, self._log_sigmoid, math_ops.log_sigmoid)
@@ -245,6 +247,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(x, np.log, math_ops.log)
     self._compareBoth(x, np.log1p, math_ops.log1p)
+    self._compareBoth(x, np.sinh, math_ops.sinh)
+    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(x, np.sign, math_ops.sign)
@@ -285,6 +289,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.expm1, math_ops.expm1)
     self._compareBoth(z, np.log, math_ops.log)
     self._compareBoth(z, np.log1p, math_ops.log1p)
+    self._compareBoth(x, np.sinh, math_ops.sinh)
+    self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(y, np.sign, math_ops.sign)
@@ -389,6 +395,8 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
+    self._compareCpu(x, np.sinh, math_ops.sinh)
+    self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
@@ -423,6 +431,8 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
+    self._compareCpu(x, np.sinh, math_ops.sinh)
+    self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
@@ -697,6 +707,11 @@ class BinaryOpTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
+  def testUint8Basic(self):
+    x = np.arange(1, 13, 2).reshape(1, 3, 2).astype(np.uint8)
+    y = np.arange(1, 7, 1).reshape(1, 3, 2).astype(np.uint8)
+    self._compareBoth(x, y, np.add, math_ops.add)
+
   def testInt8Basic(self):
     x = np.arange(1, 13, 2).reshape(1, 3, 2).astype(np.int8)
     y = np.arange(1, 7, 1).reshape(1, 3, 2).astype(np.int8)
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/fft_ops_test.py
index 6544fe9735..6c575aea12 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/fft_ops_test.py
@@ -338,37 +338,38 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 use_placeholder=True)
 
   def testFftLength(self):
-    with self._fft_kernel_label_map():
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(rank, rank + 3):
-          for size in (5, 6):
-            inner_dim = size // 2 + 1
-            r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
-                (size,) * dims)
-            c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
-                         10).reshape((size,) * (dims - 1) + (inner_dim,))
-
-            # Test truncation (FFT size < dimensions).
-            fft_length = (size - 2,) * rank
-            self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
-            # Confirm it works with unknown shapes as well.
-            self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                 use_placeholder=True)
-            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                  use_placeholder=True)
-
-            # Test padding (FFT size > dimensions).
-            fft_length = (size + 2,) * rank
-            self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
-            # Confirm it works with unknown shapes as well.
-            self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                 use_placeholder=True)
-            self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                  use_placeholder=True)
+    if test.is_gpu_available(cuda_only=True):
+      with self._fft_kernel_label_map():
+        for rank in VALID_FFT_RANKS:
+          for dims in xrange(rank, rank + 3):
+            for size in (5, 6):
+              inner_dim = size // 2 + 1
+              r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
+                  (size,) * dims)
+              c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
+                           10).reshape((size,) * (dims - 1) + (inner_dim,))
+
+              # Test truncation (FFT size < dimensions).
+              fft_length = (size - 2,) * rank
+              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
+              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
+
+              # Confirm it works with unknown shapes as well.
+              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
+                                   use_placeholder=True)
+              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
+                                    use_placeholder=True)
+
+              # Test padding (FFT size > dimensions).
+              fft_length = (size + 2,) * rank
+              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
+              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
+
+              # Confirm it works with unknown shapes as well.
+              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
+                                   use_placeholder=True)
+              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
+                                    use_placeholder=True)
 
   def testRandom(self):
     with self._fft_kernel_label_map():
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 2d2169c310..4ceb24862f 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+TIMEOUT = 1
 
 class MapStageTest(test.TestCase):
 
@@ -194,8 +195,7 @@ class MapStageTest(test.TestCase):
     import threading
 
     queue = Queue.Queue()
-    n = 5
-    missed = 0
+    n = 8
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -207,31 +207,34 @@ class MapStageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
+      t.daemon = True
       t.start()
 
-      # Get tokens from the queue, making notes of when we timeout
-      for i in range(n):
-        try:
-          queue.get(timeout=0.05)
-        except Queue.Empty:
-          missed += 1
-
-      # We timed out n - capacity times waiting for queue puts
-      self.assertTrue(missed == n - capacity)
-
-      # Clear the staging area out a bit
-      for i in range(n - capacity):
-        sess.run(get)
-
-      # This should now succeed
-      t.join()
-
+      # Get tokens from the queue until a timeout occurs
+      try:
+        for i in range(n):
+          queue.get(timeout=TIMEOUT)
+      except Queue.Empty:
+        pass
+
+      # Should've timed out on the iteration 'capacity'
+      if not i == capacity:
+        self.fail("Expected to timeout on iteration '{}' "
+                  "but instead timed out on iteration '{}' "
+                  "Staging Area size is '{}' and configured "
+                  "capacity is '{}'.".format(capacity, i,
+                                            sess.run(size),
+                                            capacity))
+
+      # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
 
-      # Clear out the staging area completely
-      for i in range(capacity):
+      # Clear the staging area completely
+      for i in range(n):
         sess.run(get)
 
+      self.assertTrue(sess.run(size) == 0)
+
   def testMemoryLimit(self):
     memory_limit = 512*1024  # 512K
     chunk = 200*1024 # 256K
@@ -256,8 +259,7 @@ class MapStageTest(test.TestCase):
     import numpy as np
 
     queue = Queue.Queue()
-    n = 5
-    missed = 0
+    n = 8
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -265,36 +267,39 @@ class MapStageTest(test.TestCase):
       # not fill the queue with n tokens
       def thread_run():
         for i in range(n):
-          sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8),
-                                    pi: i})
+          data = np.full(chunk, i, dtype=np.uint8)
+          sess.run(stage, feed_dict={x: data, pi: i})
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
+      t.daemon = True
       t.start()
 
-      # Get tokens from the queue, making notes of when we timeout
-      for i in range(n):
-        try:
-          queue.get(timeout=0.05)
-        except Queue.Empty:
-          missed += 1
-
-      # We timed out n - capacity times waiting for queue puts
-      self.assertTrue(missed == n - capacity)
-
-      # Clear the staging area out a bit
-      for i in range(n - capacity):
-        sess.run(get)
-
-      # This should now succeed
-      t.join()
-
+      # Get tokens from the queue until a timeout occurs
+      try:
+        for i in range(n):
+          queue.get(timeout=TIMEOUT)
+      except Queue.Empty:
+        pass
+
+      # Should've timed out on the iteration 'capacity'
+      if not i == capacity:
+        self.fail("Expected to timeout on iteration '{}' "
+                  "but instead timed out on iteration '{}' "
+                  "Staging Area size is '{}' and configured "
+                  "capacity is '{}'.".format(capacity, i,
+                                            sess.run(size),
+                                            capacity))
+
+      # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
 
-      # Clear out the staging area completely
-      for i in range(capacity):
+      # Clear the staging area completely
+      for i in range(n):
         sess.run(get)
 
+      self.assertTrue(sess.run(size) == 0)
+
   def testOrdering(self):
     import six
     import random
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index a3fc98c20f..8fec2affa5 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -53,6 +53,7 @@ class RecordInputOpTest(test.TestCase):
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
+    batches = 2
     with self.test_session() as sess:
       self.generateTestData("basic", files, records_per_file)
 
@@ -63,17 +64,20 @@ class RecordInputOpTest(test.TestCase):
           batch_size=1,
           shift_ratio=0.33,
           seed=10,
-          name="record_input")
+          name="record_input",
+          batches=batches)
 
       yield_op = records.get_yield_op()
 
       # cycle over 3 epochs and make sure we never duplicate
       for _ in range(3):
         epoch_set = set()
-        for _ in range(files * records_per_file):
-          r = sess.run(yield_op)
-          self.assertTrue(r[0] not in epoch_set)
-          epoch_set.add(r[0])
+        for _ in range(int(files * records_per_file / batches)):
+          op_list = sess.run(yield_op)
+          self.assertTrue(len(op_list) is batches)
+          for r in op_list:
+            self.assertTrue(r[0] not in epoch_set)
+            epoch_set.add(r[0])
 
   def testDoesNotDeadlock(self):
     # Iterate multiple times to cause deadlock if there is a chance it can occur
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index 97d61d52af..52cf904528 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -504,16 +504,16 @@ class TileTest(test.TestCase):
       with self.assertRaises(ValueError):
         array_ops.tile(a, [[2, 3], [3, 4]]).eval()
 
-  def _RunAndVerifyResult(self, use_gpu):
+  def _RunAndVerifyResult(self, rank, use_gpu):
     with self.test_session(use_gpu=use_gpu):
-      # Random dims of rank 5
-      input_shape = np.random.randint(1, 4, size=5)
+      # Random dims of given rank
+      input_shape = np.random.randint(1, 4, size=rank)
       inp = np.random.rand(*input_shape).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.ravel(order="C")],
           shape=input_shape,
           dtype=dtypes.float32)
-      multiples = np.random.randint(1, 4, size=5).astype(np.int32)
+      multiples = np.random.randint(1, 4, size=rank).astype(np.int32)
       tiled = array_ops.tile(a, multiples)
       result = tiled.eval()
     self.assertTrue((np.array(multiples) * np.array(inp.shape) == np.array(
@@ -522,10 +522,16 @@ class TileTest(test.TestCase):
     self.assertShapeEqual(result, tiled)
 
   def testRandom(self):
+    # test low rank, like 5
     for _ in range(5):
-      self._RunAndVerifyResult(use_gpu=False)
+      self._RunAndVerifyResult(5, use_gpu=False)
     for _ in range(5):
-      self._RunAndVerifyResult(use_gpu=True)
+      self._RunAndVerifyResult(5, use_gpu=True)
+    # test high rank, like 10
+    for _ in range(5):
+      self._RunAndVerifyResult(10, use_gpu=False)
+    for _ in range(5):
+      self._RunAndVerifyResult(10, use_gpu=True)
 
   def testGradientSimpleReduction(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index e67a2c25e9..6664362226 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -544,21 +544,24 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out, np.zeros(2).astype(np.bool))
 
 
-class SparseReduceSumTest(test_util.TensorFlowTestCase):
+class SparseReduceTest(test_util.TensorFlowTestCase):
 
-  # [[1, ?, 1]
-  #  [?, 1, ?]]
+  # [[1, ?, 2]
+  #  [?, 3, ?]]
   # where ? is implictly-zero.
   ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64)
   vals = np.array([1, 1, 1]).astype(np.int32)
   dense_shape = np.array([2, 3]).astype(np.int64)
 
-  def _compare(self, sp_t, reduction_axes, ndims, keep_dims):
+  def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
     densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
 
     np_ans = densified
     if reduction_axes is None:
-      np_ans = np.sum(np_ans, keepdims=keep_dims)
+      if do_sum:
+        np_ans = np.sum(np_ans, keepdims=keep_dims)
+      else:
+        np_ans = np.max(np_ans, keepdims=keep_dims)
     else:
       if not isinstance(reduction_axes, list):  # Single scalar.
         reduction_axes = [reduction_axes]
@@ -568,15 +571,28 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
       # Loop below depends on sorted.
       reduction_axes.sort()
       for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        if do_sum:
+          np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        else:
+          np_ans = np.max(np_ans, axis=ra, keepdims=keep_dims)
 
     with self.test_session():
-      tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
-                                                  keep_dims)
+      if do_sum:
+        tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
+                                                    keep_dims)
+      else:
+        tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
+                                                    keep_dims)
       out_dense = tf_dense_ans.eval()
 
-      tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t, reduction_axes,
-                                                          keep_dims)
+      if do_sum:
+        tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
+                                                            reduction_axes,
+                                                            keep_dims)
+      else:
+        tf_sparse_ans = sparse_ops.sparse_reduce_max_sparse(sp_t,
+                                                            reduction_axes,
+                                                            keep_dims)
       # Convert to dense for comparison purposes.
       out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
 
@@ -584,8 +600,10 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
     self.assertAllClose(np_ans, out_sparse)
 
   def _compare_all(self, sp_t, reduction_axes, ndims):
-    self._compare(sp_t, reduction_axes, ndims, False)
-    self._compare(sp_t, reduction_axes, ndims, True)
+    self._compare(sp_t, reduction_axes, ndims, False, False)
+    self._compare(sp_t, reduction_axes, ndims, False, True)
+    self._compare(sp_t, reduction_axes, ndims, True, False)
+    self._compare(sp_t, reduction_axes, ndims, True, True)
 
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
@@ -621,6 +639,10 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
         sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
         sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
+      with self.assertRaisesOpError("Invalid reduction dimension -3"):
+        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
+      with self.assertRaisesOpError("Invalid reduction dimension 2"):
+        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
 
   def testGradient(self):
     if np.__version__ == "1.13.0":
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 4a89fb64e3..1a6a869e3d 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -23,9 +23,11 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+TIMEOUT = 1
 
 class StageTest(test.TestCase):
 
+
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -172,8 +174,7 @@ class StageTest(test.TestCase):
     import threading
 
     queue = Queue.Queue()
-    n = 5
-    missed = 0
+    n = 8
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -185,31 +186,33 @@ class StageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
+      t.daemon = True
       t.start()
 
-      # Get tokens from the queue, making notes of when we timeout
-      for i in range(n):
-        try:
-          queue.get(timeout=0.05)
-        except Queue.Empty:
-          missed += 1
-
-      # We timed out n - capacity times waiting for queue puts
-      self.assertTrue(missed == n - capacity)
-
-      # Clear the staging area out a bit
-      for i in range(n - capacity):
-        self.assertTrue(sess.run(ret) == i)
-
-      # Thread should be able to join now
-      t.join()
-
+      # Get tokens from the queue until a timeout occurs
+      try:
+        for i in range(n):
+          queue.get(timeout=TIMEOUT)
+      except Queue.Empty:
+        pass
+
+      # Should've timed out on the iteration 'capacity'
+      if not i == capacity:
+        self.fail("Expected to timeout on iteration '{}' "
+                  "but instead timed out on iteration '{}' "
+                  "Staging Area size is '{}' and configured "
+                  "capacity is '{}'.".format(capacity, i,
+                                            sess.run(size),
+                                            capacity))
+
+      # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
 
       # Clear the staging area completely
-      for i in range(capacity):
-        self.assertTrue(sess.run(ret) == i+(n-capacity))
+      for i in range(n):
+        self.assertTrue(sess.run(ret) == i)
 
+      # It should now be empty
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
@@ -234,8 +237,7 @@ class StageTest(test.TestCase):
     import numpy as np
 
     queue = Queue.Queue()
-    n = 5
-    missed = 0
+    n = 8
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
@@ -247,30 +249,31 @@ class StageTest(test.TestCase):
           queue.put(0)
 
       t = threading.Thread(target=thread_run)
+      t.daemon = True
       t.start()
 
-      # Get tokens from the queue, making notes of when we timeout
-      for i in range(n):
-        try:
-          queue.get(timeout=0.05)
-        except Queue.Empty:
-          missed += 1
-
-      # We timed out n - capacity times waiting for queue puts
-      self.assertTrue(missed == n - capacity)
-
-      # Clear the staging area out a bit
-      for i in range(n - capacity):
-        self.assertTrue(sess.run(ret)[0] == i)
-
-      # Thread should be able to join now
-      t.join()
-
+      # Get tokens from the queue until a timeout occurs
+      try:
+        for i in range(n):
+          queue.get(timeout=TIMEOUT)
+      except Queue.Empty:
+        pass
+
+      # Should've timed out on the iteration 'capacity'
+      if not i == capacity:
+        self.fail("Expected to timeout on iteration '{}' "
+                  "but instead timed out on iteration '{}' "
+                  "Staging Area size is '{}' and configured "
+                  "capacity is '{}'.".format(capacity, i,
+                                            sess.run(size),
+                                            capacity))
+
+      # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
 
       # Clear the staging area completely
-      for i in range(capacity):
-        self.assertTrue(sess.run(ret)[0] == i+(n-capacity))
+      for i in range(n):
+        self.assertTrue(np.all(sess.run(ret) == i))
 
       self.assertTrue(sess.run(size) == 0)
 
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index df35c43c3d..ba749da47a 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -55,6 +55,14 @@ bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
   return s.ok();
 }
 
+void PyRecordWriter::Flush(TF_Status* out_status) {
+  Status s = writer_->Flush();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+}
+
 void PyRecordWriter::Close(TF_Status* out_status) {
   Status s = writer_->Close();
   if (!s.ok()) {
diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h
index 8c53420ce6..9d66c031d4 100644
--- a/tensorflow/python/lib/io/py_record_writer.h
+++ b/tensorflow/python/lib/io/py_record_writer.h
@@ -44,6 +44,7 @@ class PyRecordWriter {
   ~PyRecordWriter();
 
   bool WriteRecord(tensorflow::StringPiece record);
+  void Flush(TF_Status* out_status);
   void Close(TF_Status* out_status);
 
  private:
diff --git a/tensorflow/python/lib/io/py_record_writer.i b/tensorflow/python/lib/io/py_record_writer.i
index 9e61c9893a..3181c9afce 100644
--- a/tensorflow/python/lib/io/py_record_writer.i
+++ b/tensorflow/python/lib/io/py_record_writer.i
@@ -45,6 +45,7 @@ limitations under the License.
 %unignore tensorflow::io::PyRecordWriter;
 %unignore tensorflow::io::PyRecordWriter::~PyRecordWriter;
 %unignore tensorflow::io::PyRecordWriter::WriteRecord;
+%unignore tensorflow::io::PyRecordWriter::Flush;
 %unignore tensorflow::io::PyRecordWriter::Close;
 %unignore tensorflow::io::PyRecordWriter::New;
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 3d0cdc2153..df19010068 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -121,6 +121,11 @@ class TFRecordWriter(object):
     """
     self._writer.WriteRecord(record)
 
+  def flush(self):
+    """Flush the file."""
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._writer.Flush(status)
+
   def close(self):
     """Close the file."""
     with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index b2aff617df..8db5efb447 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -402,14 +402,15 @@ def _SliceHelper(tensor, slice_spec, var=None):
 
   # Insert another dimension
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[tf.newaxis, :, :].eval()) # => [[[3,2,1], [9,8,7]]]
-  print(foo[:, tf.newaxis, :].eval()) # => [[[3,2,1]], [[9,8,7]]]
-  print(foo[:, :, tf.newaxis].eval()) # => [[[3],[2],[1]], [[9],[8],[7]]]
+  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[:, tf.newaxis, :].eval()) # => [[[1,2,3]], [[4,5,6]], [[7,8,9]]]
+  print(foo[:, :, tf.newaxis].eval()) # => [[[1],[2],[3]], [[4],[5],[6]], [[7],[8],[9]]]
 
   # Ellipses (3 equivalent operations)
-  print(foo[tf.newaxis, :, :].eval()) # => [[[3,2,1], [9,8,7]]]
-  print(foo[tf.newaxis, ...].eval()) # => [[[3,2,1], [9,8,7]]]
-  print(foo[tf.newaxis].eval()) # => [[[3,2,1], [9,8,7]]]
+  foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
+  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, ...].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
   ```
 
   Notes:
@@ -760,11 +761,14 @@ def parallel_stack(values, name="parallel_stack"):
   parallel_stack([x, y, z])  # => [[1, 4], [2, 5], [3, 6]]
   ```
 
-  The difference between stack and parallel_stack is that stack requires all
-  of the inputs be computed before the operation will begin but doesn't require
-  that the input shapes be known during graph construction.  Parallel stack
-  will copy pieces of the input into the output as they become available, in
-  some situations this can provide a performance benefit.
+  The difference between `stack` and `parallel_stack` is that `stack` requires
+  all the inputs be computed before the operation will begin but doesn't require
+  that the input shapes be known during graph construction.
+
+  `parallel_stack` will copy pieces of the input into the output as they become
+  available, in some situations this can provide a performance benefit.
+
+  Unlike `stack`, `parallel_stack` does NOT support backpropagation.
 
   This is the opposite of unstack.  The numpy equivalent is
 
@@ -1369,7 +1373,7 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   ```
 
   Args:
-    shape: Either a list of integers, or a 1-D `Tensor` of type `int32`.
+    shape: A list of integers, a tuple of integers, or a 1-D `Tensor` of type `int32`.
     dtype: The type of an element in the resulting `Tensor`.
     name: A name for the operation (optional).
 
@@ -1483,7 +1487,7 @@ def ones(shape, dtype=dtypes.float32, name=None):
   ```
 
   Args:
-    shape: Either a list of integers, or a 1-D `Tensor` of type `int32`.
+    shape: A list of integers, a tuple of integers, or a 1-D `Tensor` of type `int32`.
     dtype: The type of an element in the resulting `Tensor`.
     name: A name for the operation (optional).
 
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 4eead79531..1dc5eb2f90 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -248,7 +248,7 @@ class QueueBase(object):
     if isinstance(vals, dict):
       if not self._names:
         raise ValueError("Queue must have names to enqueue a dictionary")
-      if sorted(self._names) != sorted(vals.keys()):
+      if sorted(self._names, key=str) != sorted(vals.keys(), key=str):
         raise ValueError("Keys in dictionary to enqueue do not match "
                          "names of Queue.  Dictionary: (%s), Queue: (%s)" %
                          (sorted(vals.keys()), sorted(self._names)))
@@ -1582,7 +1582,7 @@ class StagingArea(BaseStagingArea):
   This is mostly useful for limiting the number of tensors on
   devices such as GPUs.
 
-  All get() and peek() commands block if the the requested data
+  All get() and peek() commands block if the requested data
   is not present in the Staging Area.
 
   """
@@ -2155,7 +2155,8 @@ class RecordInput(object):
                parallelism=1,
                shift_ratio=0,
                seed=0,
-               name=None):
+               name=None,
+               batches=None):
     """Constructs a RecordInput Op.
 
     Args:
@@ -2169,12 +2170,18 @@ class RecordInput(object):
       seed: Specify the random number seed used by generator that randomizes
         records.
       name: Optional name for the operation.
+      batches: None by default, creating a single batch op. Otherwise specifies
+        how many batches to create, which are returned as a list when
+        `get_yield_op()` is called. An example use case is to split processing
+        between devices on one computer.
 
     Raises:
       ValueError: If one of the arguments is invalid.
     """
-
     self._batch_size = batch_size
+    if batches is not None:
+      self._batch_size *= batches
+    self._batches = batches
     self._file_pattern = file_pattern
     self._buffer_size = buffer_size
     self._parallelism = parallelism
@@ -2183,8 +2190,11 @@ class RecordInput(object):
     self._name = name
 
   def get_yield_op(self):
-    """Add a node that yields a minibatch every time it is executed."""
-    return gen_data_flow_ops.record_input(
+    """Adds a node that yields a group of records every time it is executed.
+    If RecordInput `batches` parameter is not None, it yields a list of
+    record batches with the specified `batch_size`.
+    """
+    records = gen_data_flow_ops.record_input(
         file_pattern=self._file_pattern,
         file_buffer_size=self._buffer_size,
         file_parallelism=self._parallelism,
@@ -2192,3 +2202,14 @@ class RecordInput(object):
         batch_size=self._batch_size,
         file_random_seed=self._seed,
         name=self._name)
+    if self._batches is None:
+      return records
+    else:
+      with ops.name_scope(self._name):
+        batch_list = [[] for i in six.moves.range(self._batches)]
+        records = array_ops.split(records, self._batch_size, 0)
+        records = [array_ops.reshape(record, []) for record in records]
+        for index, protobuf in zip(six.moves.range(len(records)), records):
+          batch_index = index % self._batches
+          batch_list[batch_index].append(protobuf)
+        return batch_list
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index f96eafed71..3a804c941a 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -324,7 +324,7 @@ def log_ndtr(x, series_order=3, name="log_ndtr"):
 
 
 def _log_ndtr_lower(x, series_order):
-  """Asymptotic expansion version of `Log[cdf(x)]`, apppropriate for `x<<-1`."""
+  """Asymptotic expansion version of `Log[cdf(x)]`, appropriate for `x<<-1`."""
   x_2 = math_ops.square(x)
   # Log of the term multiplying (1 + sum)
   log_scale = -0.5 * x_2 - math_ops.log(-x) - 0.5 * math.log(2. * math.pi)
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index bd8a5c86ac..52ce451238 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -45,6 +46,7 @@ from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -920,8 +922,6 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
   Raises:
     LookupError: if one of the operations between `xs` and `ys` does not
       have a registered gradient function.
-    ValueError: if the arguments are invalid or not supported. Currently,
-      this function only supports one-dimensional `x` in `xs`.
   """
   xs = _AsList(xs)
   kwargs = {
@@ -929,28 +929,30 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
       'gate_gradients': gate_gradients,
       'aggregation_method': aggregation_method
     }
-  # Compute a hessian matrix for each x in xs
+  # Compute first-order derivatives and iterate for each x in xs.
   hessians = []
-  for i, x in enumerate(xs):
-    # Check dimensions
-    ndims = x.get_shape().ndims
-    if ndims is None:
-      raise ValueError('Cannot compute Hessian because the dimensionality of '
-                       'element number %d of `xs` cannot be determined' % i)
-    elif ndims != 1:
-      raise ValueError('Computing hessians is currently only supported for '
-                       'one-dimensional tensors. Element number %d of `xs` has '
-                       '%d dimensions.' % (i, ndims))
-    with ops.name_scope(name + '_first_derivative'):
-      # Compute the partial derivatives of the input with respect to all
-      # elements of `x`
-      _gradients = gradients(ys, x, **kwargs)[0]
-      # Unpack the gradients into a list so we can take derivatives with
-      # respect to each element
-      _gradients = array_ops.unstack(_gradients)
-    with ops.name_scope(name + '_second_derivative'):
-      # Compute the partial derivatives with respect to each element of the list
-      _hess = [gradients(_gradient, x, **kwargs)[0] for _gradient in _gradients]
-      # Pack the list into a matrix and add to the list of hessians
-      hessians.append(array_ops.stack(_hess, name=name))
+  _gradients = gradients(ys, xs, **kwargs)
+  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
+    # Ensure that x is a vector.
+    check_rank = check_ops.assert_rank(
+      x, 1, message='Cannot compute Hessian because element %d of `xs` does '
+      'not have rank one.' % i
+    )
+    with ops.control_dependencies([check_rank]):
+      # Declare an iterator and tensor array loop variables for the gradients.
+      n = array_ops.size(x)
+      loop_vars = [
+        array_ops.constant(0, dtypes.int32),
+        tensor_array_ops.TensorArray(x.dtype, n)
+      ]
+      # Iterate over all elements of the gradient and compute second order
+      # derivatives.
+      _, hessian = control_flow_ops.while_loop(
+          lambda j, _: j < n,
+          lambda j, result: (j + 1,
+                             result.write(j, gradients(_gradient[j], x)[0])),
+          loop_vars
+      )
+
+      hessians.append(hessian.stack())
   return hessians
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index aac7aef8ce..dade053589 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -989,9 +989,9 @@ def index_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   table = tf.contrib.lookup.index_table_from_tensor(
-      vocabulary_list=vocabulary_list, num_oov_buckets=1, default_value=-1)
+      mapping=vocabulary_list, num_oov_buckets=1, default_value=-1)
   features = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = table.lookup(features)
   ...
@@ -1161,7 +1161,7 @@ def index_to_string_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   table = tf.contrib.lookup.index_to_string_table_from_tensor(
       vocabulary_list, default_value="UNKNOWN")
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 024158e709..a0f505e47b 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -369,6 +369,24 @@ def _Log1pGrad(op, grad):
     return grad * math_ops.reciprocal(1 + x)
 
 
+@ops.RegisterGradient("Sinh")
+def _SinhGrad(op, grad):
+  """Returns grad * cosh(x)."""
+  x = op.inputs[0]
+  with ops.control_dependencies([grad.op]):
+    x = math_ops.conj(x)
+    return grad * math_ops.cosh(x)
+
+
+@ops.RegisterGradient("Cosh")
+def _CoshGrad(op, grad):
+  """Returns grad * sinh(x)."""
+  x = op.inputs[0]
+  with ops.control_dependencies([grad.op]):
+    x = math_ops.conj(x)
+    return grad * math_ops.sinh(x)
+
+
 @ops.RegisterGradient("Tanh")
 def _TanhGrad(op, grad):
   """Returns grad * (1 - tanh(x) * tanh(x))."""
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 23af8c0a57..bf46bbdcc2 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -45,6 +45,8 @@ See the @{$python/math_ops} guide.
 @@expm1
 @@log
 @@log1p
+@@sinh
+@@cosh
 @@ceil
 @@floor
 @@maximum
@@ -2063,12 +2065,11 @@ def tanh(x, name=None):
 
   Args:
     x: A Tensor or SparseTensor with type `float`, `double`, `int32`,
-      `complex64`, `int64`, or `qint32`.
+      `complex64`, or `int64`.
     name: A name for the operation (optional).
 
   Returns:
-    A Tensor or SparseTensor respectively with the same type as `x` if
-    `x.dtype != qint32` otherwise the return type is `quint8`.
+    A Tensor or SparseTensor respectively with the same type as `x`.
   """
   with ops.name_scope(name, "Tanh", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 53db96f334..f7854e86c0 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -419,7 +419,7 @@ class LSTMCell(RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+    http://www.bioinf.jku.at/publications/older/2604.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
@@ -628,7 +628,7 @@ class DropoutWrapper(RNNCell):
     """Create a cell with added input, state, and/or output dropout.
 
     If `variational_recurrent` is set to `True` (**NOT** the default behavior),
-    then the the same dropout mask is applied at every step, as described in:
+    then the same dropout mask is applied at every step, as described in:
 
     Y. Gal, Z Ghahramani.  "A Theoretically Grounded Application of Dropout in
     Recurrent Neural Networks".  https://arxiv.org/abs/1512.05287
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 377adcdab8..93a9656950 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -30,6 +30,8 @@
 @@sparse_reset_shape
 @@sparse_fill_empty_rows
 @@sparse_transpose
+@@sparse_reduce_max
+@@sparse_reduce_max_sparse
 @@sparse_reduce_sum
 @@sparse_reduce_sum_sparse
 @@sparse_add
@@ -710,6 +712,90 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
+def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
+                      reduction_axes=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+  instead of a sparse one.
+
+  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 2]
+  #                 [?, 3, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse_reduce_max(x) ==> 3
+  tf.sparse_reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse_reduce_max(x, 1, keep_dims=True) ==> [[2], [3]]
+  tf.sparse_reduce_max(x, [0, 1]) ==> 3
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keep_dims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis.
+
+  Returns:
+    The reduced Tensor.
+  """
+  return gen_sparse_ops.sparse_reduce_max(
+      sp_input.indices, sp_input.values,
+      sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes),
+      keep_dims)
+
+
+def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
+                             reduction_axes=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In contrast to SparseReduceSum, this Op returns a
+  SparseTensor.
+
+  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  which are interpreted according to the indexing rules in Python.
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keep_dims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis
+
+  Returns:
+    The reduced SparseTensor.
+  """
+  output_ind, output_val, output_shape = (
+      gen_sparse_ops.sparse_reduce_max_sparse(
+          sp_input.indices, sp_input.values,
+          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
+                                                        reduction_axes),
+          keep_dims))
+
+  return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+
 def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
                       reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index a6b14f6f6f..30bf4e4ef1 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -145,7 +145,7 @@ _allowed_symbols_math_ops = [
     "sub",  # use tf.subtract instead.
 
     # These are documented in nn.
-    # We are are not importing nn because it would create a circular dependency.
+    # We are not importing nn because it would create a circular dependency.
     "sigmoid",
     "log_sigmoid",
     "tanh",
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 7a6abc8e61..20ae082ee1 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -87,7 +87,7 @@ class TensorArray(object):
         the shape constraints of each of the elements of the TensorArray.
         Need not be fully defined.
       colocate_with_first_write_call: If `True`, the TensorArray will be
-        colocated on the same device as the the Tensor used on its first write
+        colocated on the same device as the Tensor used on its first write
         (write operations include `write`, `unstack`, and `split`).  If `False`,
         the TensorArray will be placed on the device determined by the
         device context available during its initialization.
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index f784b715c1..c2193a24d0 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1501,6 +1501,11 @@ def variable_scope(name_or_scope,
   A note about name scoping: Setting `reuse` does not impact the naming of other
   ops such as mult. See related discussion on [github#6189](https://github.com/tensorflow/tensorflow/issues/6189)
 
+  Note that up to and including version 1.0, it was allowed (though
+  explicitly discouraged) to pass False to the reuse argument, yielding
+  undocumented behaviour slightly different from None. Starting at 1.1.0
+  passing None and False as reuse has exactly the same effect.
+
   Args:
     name_or_scope: `string` or `VariableScope`: the scope to open.
     default_name: The default name to use if the `name_or_scope` argument is
diff --git a/tensorflow/python/tools/print_selective_registration_header.py b/tensorflow/python/tools/print_selective_registration_header.py
index 3e2ab4695e..62f00f4467 100644
--- a/tensorflow/python/tools/print_selective_registration_header.py
+++ b/tensorflow/python/tools/print_selective_registration_header.py
@@ -16,7 +16,7 @@ r"""Prints a header file to be used with SELECTIVE_REGISTRATION.
 
 An example of command-line usage is:
   bazel build tensorflow/python/tools:print_selective_registration_header && \
-  bazel-bin/tensorflow/python/tools:print_selective_registration_header \
+  bazel-bin/tensorflow/python/tools/print_selective_registration_header \
     --graphs=path/to/graph.pb > ops_to_register.h
 
 Then when compiling tensorflow, include ops_to_register.h in the include search
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 1755167938..93ba15ec81 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -382,7 +382,7 @@ class _SparseMetaData(object):
 
 def _as_tensor_list(tensors):
   if isinstance(tensors, dict):
-    return [tensors[k] for k in sorted(tensors)]
+    return [tensors[k] for k in sorted(tensors, key=str)]
   else:
     return tensors
 
@@ -408,7 +408,7 @@ def _as_original_type(original_tensors, tensor_list):
       # was enqueued.  Make it a list again.  See b/28117485.
       tensor_list = [tensor_list]
     return {k: tensor_list[i]
-            for i, k in enumerate(sorted(original_tensors))}
+            for i, k in enumerate(sorted(original_tensors, key=str))}
   else:
     return tensor_list
 
@@ -762,6 +762,9 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
   tensor_list = _as_tensor_list(tensors)
   with ops.name_scope(name, "shuffle_batch",
                       list(tensor_list) + [keep_input]) as name:
+    if capacity <= min_after_dequeue:
+      raise ValueError("capacity %d must be bigger than min_after_dequeue %d."
+                       % (capacity, min_after_dequeue))
     tensor_list = _validate(tensor_list)
     keep_input = _validate_keep_input(keep_input, enqueue_many)
     tensor_list, sparse_info = _store_sparse_tensors(
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 4f705a0a85..d32768f5da 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -429,6 +429,13 @@ class DictHelperTest(test_lib.TestCase):
     d2 = inp._as_original_type(d, l)
     self.assertEquals(d, d2)
 
+  def testHeterogeneousKeysDictInputs(self):
+    d = {"z": 1, 1: 42, ("a", "b"): 100}
+    l = inp._as_tensor_list(d)
+    self.assertEquals([100, 42, 1], l)
+    d2 = inp._as_original_type(d, l)
+    self.assertEquals(d, d2)
+
 
 class BatchTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index ffd7c12c42..f34ff22f07 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -52,7 +52,7 @@ class MomentumOptimizer(optimizer.Optimizer):
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Momentum".
       use_nesterov: If `True` use Nesterov Momentum.
-        See [Sutskever et. al., 2013](
+        See [Sutskever et al., 2013](
         http://jmlr.org/proceedings/papers/v28/sutskever13.pdf)
 
     """
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 690af5485a..5b1d9a517e 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -147,7 +147,7 @@ def tf_copts():
       "-Iexternal/gemmlowp",
       "-Wno-sign-compare",
       "-fno-exceptions",
-  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_android_arm(
+  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
       ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({
           clean_dep("//tensorflow:android"): [
               "-std=c++11",
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index b761f41361..eca017edf5 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -768,6 +768,10 @@ tf_module {
     name: "cos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "cosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "count_nonzero"
     argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"<dtype: \'int64\'>\", \'None\', \'None\'], "
@@ -1640,6 +1644,10 @@ tf_module {
     name: "sin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
@@ -1696,6 +1704,14 @@ tf_module {
     name: "sparse_placeholder"
     argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "sparse_reduce_max"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_reduce_max_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "sparse_reduce_sum"
     argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
index af0c11ca14..31775de2d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
@@ -10,6 +10,10 @@ tf_class {
     name: "close"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "write"
     argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index e0a1391d6e..85c712d3c6 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -205,14 +205,17 @@ if [[ -n "${PY_TAGS}" ]]; then
 $(echo ${WHL_BASE_NAME} | cut -d \- -f 2)-${PY_TAGS}-${PLATFORM_TAG}.whl
 
   if [[ ! -f "${WHL_DIR}/${NEW_WHL_BASE_NAME}" ]]; then
-    cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}" && \
-      echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}" || \
+    if cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}"
+    then
+      echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}"
+    else
       die "ERROR: Failed to copy wheel file to ${NEW_WHL_BASE_NAME}"
+    fi
   fi
 fi
 
 if [[ $(uname) == "Linux" ]]; then
-  AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME} | sed "s/linux/manylinux1/")"
+  AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME//linux/manylinux1})"
 
   # Repair the wheels for cpu manylinux1
   if [[ ${CONTAINER_TYPE} == "cpu" ]]; then
@@ -240,14 +243,20 @@ echo "Installing pip whl file: ${WHL_PATH}"
 VENV_DIR="${PIP_TEST_ROOT}/venv"
 
 if [[ -d "${VENV_DIR}" ]]; then
-  rm -rf "${VENV_DIR}" && \
-      echo "Removed existing virtualenv directory: ${VENV_DIR}" || \
-      die "Failed to remove existing virtualenv directory: ${VENV_DIR}"
+  if rm -rf "${VENV_DIR}"
+  then
+    echo "Removed existing virtualenv directory: ${VENV_DIR}"
+  else
+    die "Failed to remove existing virtualenv directory: ${VENV_DIR}"
+  fi
 fi
 
-mkdir -p ${VENV_DIR} && \
-    echo "Created virtualenv directory: ${VENV_DIR}" || \
-    die "FAILED to create virtualenv directory: ${VENV_DIR}"
+if mkdir -p ${VENV_DIR}
+then
+  echo "Created virtualenv directory: ${VENV_DIR}"
+else
+  die "FAILED to create virtualenv directory: ${VENV_DIR}"
+fi
 
 # Verify that virtualenv exists
 if [[ -z $(which virtualenv) ]]; then
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 3b640dd5e8..9c1b75d004 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -80,7 +80,7 @@ fi
 
 # cmake (CPU) builds do not require configuration.
 if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then
-  CI_COMMAND_PREFIX=""
+  CI_COMMAND_PREFIX=("")
 fi
 
 # Use nvidia-docker if the container is GPU.
@@ -120,9 +120,9 @@ DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]')
 
 # Print arguments.
 echo "WORKSPACE: ${WORKSPACE}"
-echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[@]}"
-echo "COMMAND: ${COMMAND[@]}"
-echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[@]}"
+echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[*]}"
+echo "COMMAND: ${COMMAND[*]}"
+echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[*]}"
 echo "CONTAINER_TYPE: ${CONTAINER_TYPE}"
 echo "BUILD_TAG: ${BUILD_TAG}"
 echo "  (docker container name will be ${DOCKER_IMG_NAME})"
@@ -140,7 +140,7 @@ if [[ $? != "0" ]]; then
 fi
 
 # Run the command inside the container.
-echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..."
+echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..."
 mkdir -p ${WORKSPACE}/bazel-ci_build-cache
 # By default we cleanup - remove the container once it finish running (--rm)
 # and share the PID namespace (--pid=host) so the process inside does not have
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index dfaf50eb4f..1cf87d7c7c 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -200,8 +200,8 @@ echo "  TF_BUILD_ENABLE_XLA=${TF_BUILD_ENABLE_XLA}"
 function get_cuda_capability_version() {
   if [[ ! -z $(which deviceQuery) ]]; then
     # The first listed device is used
-    echo $(deviceQuery | grep "CUDA Capability .* version" | \
-        head -1 | awk '{print $NF}')
+    deviceQuery | grep "CUDA Capability .* version" | \
+        head -1 | awk '{print $NF}'
   fi
 }
 
@@ -532,11 +532,14 @@ if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then
   DOCKERFILE="${TMP_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}"
 
   # Replace a line in the Dockerfile
-  sed -i \
+  if sed -i \
       's/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_python3.5_pip_packages.sh/g' \
-      "${DOCKERFILE}" && \
-      echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" || \
-      die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}"
+      "${DOCKERFILE}"
+  then
+    echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}"
+  else
+    die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}"
+  fi
 
   DOCKERFILE_FLAG="--dockerfile ${DOCKERFILE}"
 fi
@@ -574,7 +577,7 @@ rm -f ${TMP_SCRIPT}
 END_TIME=$(date +'%s')
 echo ""
 echo "Parameterized build ends with ${RESULT} at: $(date) "\
-"(Elapsed time: $((${END_TIME} - ${START_TIME})) s)"
+"(Elapsed time: $((END_TIME - START_TIME)) s)"
 
 
 # Clean up temporary directory if it exists
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index fd2874df91..e428766a40 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -47,7 +47,7 @@ num_cpus() {
 # Get the hash of the last non-merge git commit on the current branch.
 # Usage: get_last_non_merge_git_commit
 get_last_non_merge_git_commit() {
-  echo $(git rev-list --no-merges -n 1 HEAD)
+  git rev-list --no-merges -n 1 HEAD
 }
 
 # List files changed (i.e., added, removed or revised) in the last non-merge
@@ -75,7 +75,7 @@ get_py_files_to_check() {
 
     echo "${PY_FILES}"
   else
-    echo $(find tensorflow -name '*.py')
+    find tensorflow -name '*.py'
   fi
 }
 
@@ -157,25 +157,25 @@ do_pylint() {
   NONWL_ERRORS_FILE="$(mktemp)_pylint_nonwl_errors.log"
 
   rm -rf ${OUTPUT_FILE}
-  rm -rf ${ERRORS_FLIE}
+  rm -rf ${ERRORS_FILE}
   rm -rf ${NONWL_ERRORS_FILE}
   touch ${NONWL_ERRORS_FILE}
 
   ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \
-      --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} 2>&1 > ${OUTPUT_FILE}
+      --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} > ${OUTPUT_FILE} 2>&1
   PYLINT_END_TIME=$(date +'%s')
 
   echo ""
-  echo "pylint took $((${PYLINT_END_TIME} - ${PYLINT_START_TIME})) s"
+  echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
   echo ""
 
   grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
-  while read LINE; do
+  while read -r LINE; do
     IS_WHITELISTED=0
     for WL_REGEX in ${ERROR_WHITELIST}; do
-      if [[ ! -z $(echo ${LINE} | grep "${WL_REGEX}") ]]; then
+      if echo ${LINE} | grep -q "${WL_REGEX}"; then
         echo "Found a whitelisted error:"
         echo "  ${LINE}"
         IS_WHITELISTED=1
@@ -248,7 +248,7 @@ do_pep8() {
   PEP8_END_TIME=$(date +'%s')
 
   echo ""
-  echo "pep8 took $((${PEP8_END_TIME} - ${PEP8_START_TIME})) s"
+  echo "pep8 took $((PEP8_END_TIME - PEP8_START_TIME)) s"
   echo ""
 
   if [[ -s ${PEP8_OUTPUT_FILE} ]]; then
@@ -278,7 +278,7 @@ do_buildifier(){
   BUILDIFIER_END_TIME=$(date +'%s')
 
   echo ""
-  echo "buildifier took $((${BUILDIFIER_END_TIME} - ${BUILDIFIER_START_TIME})) s"
+  echo "buildifier took $((BUILDIFIER_END_TIME - BUILDIFIER_START_TIME)) s"
   echo ""
 
   if [[ -s ${BUILDIFIER_OUTPUT_FILE} ]]; then
@@ -306,7 +306,7 @@ do_external_licenses_check(){
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
  bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
-  | egrep -v "^//tensorflow" \
+  | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
   | uniq 2>&1 \
@@ -315,7 +315,7 @@ do_external_licenses_check(){
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
   bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
-  | egrep -v "^//tensorflow" \
+  | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
   | uniq 2>&1 \
@@ -329,7 +329,7 @@ do_external_licenses_check(){
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
 
   echo
-  echo "do_external_licenses_check took $((${EXTERNAL_LICENSES_CHECK_END_TIME} - ${EXTERNAL_LICENSES_CHECK_START_TIME})) s"
+  echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
   echo
 
   if [[ -s ${MISSING_LICENSES_FILE} ]] || [[ -s ${EXTRA_LICENSES_FILE} ]] ; then
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8f9fc8453..7fcd235e62 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -44,8 +44,8 @@ pip2 install --upgrade markdown==2.6.8
 pip3 install --upgrade markdown==2.6.8
 
 # Install protobuf.
-pip2 install --upgrade protobuf==3.2.0
-pip3 install --upgrade protobuf==3.2.0
+pip2 install --upgrade protobuf==3.3.0
+pip3 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index 773c89b70b..7934002b2c 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -17,9 +17,9 @@
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.2.0"
+PROTOBUF_VERSION="3.3.0"
 protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g')
-local_protobuf_ver=$(protoc --version | awk '{print $2}')
+local_protobuf_ver=$(protoc --version)
 local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g')
 if [[ -z $local_protobuf_ver_flat ]]; then
   local_protobuf_ver_flat=0
@@ -30,7 +30,7 @@ if (( $local_protobuf_ver_flat < $protobuf_ver_flat )); then
   PROTOBUF_ZIP=$(basename "${PROTOBUF_URL}")
   UNZIP_DEST="google-protobuf"
 
-  wget -q "${PROTOBUF_URL}"
+  wget "${PROTOBUF_URL}"
   unzip "${PROTOBUF_ZIP}" -d "${UNZIP_DEST}"
   cp "${UNZIP_DEST}/bin/protoc" /usr/local/bin/
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e7e2d256cd..084ac49496 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -64,7 +64,7 @@ set -e
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.5 install --upgrade protobuf==3.2.0
+pip3.5 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
diff --git a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
index 59ba71f5df..3e31aa1ce1 100755
--- a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
+++ b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
@@ -14,7 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 
-PROTOBUF_VERSION="3.2.0"
+PROTOBUF_VERSION="3.3.1"
 PYTHON_BIN=${PYTHON_BIN:-python}
 DIR=${PWD}/protobuf
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 5581023ad7..dff4707cbe 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -161,5 +161,5 @@ function create_python_test_dir() {
 
 function reinstall_tensorflow_pip() {
   echo "y" | pip uninstall tensorflow -q || true
-  pip install ${1}
+  pip install ${1} --no-deps
 }
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 8b6836f4ed..96fbadd176 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -29,9 +29,6 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
 
-:: Attempt to upgrade PIP to work around Anaconda issue #542.
-%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
-
 :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
 DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
@@ -39,7 +36,7 @@ del wheel_filename_file
 
 :: Install the pip package.
 echo Installing PIP package...
-%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
+%PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
 if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index ba2d939b5f..e774a6e916 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -28,9 +28,6 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
 
-:: Attempt to upgrade PIP to work around Anaconda issue #542.
-%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
-
 :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
 DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
@@ -38,7 +35,7 @@ del wheel_filename_file
 
 :: Install the pip package.
 echo Installing PIP package...
-%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
+%PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
 if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index f536beef54..7d7f92d246 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -70,7 +70,7 @@ get_container_id_by_image_name() {
     # Get the id of a container by image name
     # Usage: get_docker_container_id_by_image_name <img_name>
 
-    echo $(docker ps | grep $1 | awk '{print $1}')
+    docker ps | grep $1 | awk '{print $1}'
 }
 
 # Parse input arguments
@@ -152,7 +152,7 @@ rm -rf "${BUILD_DIR}"
 docker run ${DOCKER_IMG_NAME} \
     /var/tf_dist_test/scripts/dist_mnist_test.sh \
     --ps_hosts $(seq -f "localhost:%g" -s "," \
-                 2000 $((2000 + ${NUM_PARAMETER_SERVERS} - 1))) \
+                 2000 $((2000 + NUM_PARAMETER_SERVERS - 1))) \
     --worker_hosts $(seq -f "localhost:%g" -s "," \
-                     3000 $((3000 + ${NUM_WORKERS} - 1))) \
+                     3000 $((3000 + NUM_WORKERS - 1))) \
     --num_gpus 0 ${SYNC_REPLICAS_FLAG}
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 3e45ae362c..6d5a9bdc4c 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -54,6 +54,30 @@ for additional containers, such as release candidates or nightly builds.
 
 ## Rebuilding the containers
 
-Just pick the dockerfile corresponding to the container you want to build, and run
+Building TensorFlow Docker containers should be done through the
+[parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/README.md)
+script. The raw Dockerfiles should not be used directly as they contain strings
+to be replaced by the script during the build.
 
-    $ docker build --pull -t $USER/tensorflow-suffix -f Dockerfile.suffix .
+To use the script, specify the container type (`CPU` vs. `GPU`), the desired
+Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
+is to be built (`NO` vs. `YES`). In addition, you need to specify the central
+location from where the pip package of TensorFlow will be downloaded.
+
+For example, to build a CPU-only non-developer Docker image for Python 2, using
+TensorFlow's nightly pip package:
+
+``` bash
+export TF_DOCKER_BUILD_IS_DEVEL=NO
+export TF_DOCKER_BUILD_TYPE=CPU
+export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2
+
+export NIGHTLY_VERSION="1.head"
+export TF_DOCKER_BUILD_CENTRAL_PIP=$(echo ${TF_DOCKER_BUILD_PYTHON_VERSION} | sed s^PYTHON2^http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION},label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp27-cp27mu-manylinux1_x86_64.whl^ | sed s^PYTHON3^http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp35-cp35m-manylinux1_x86_64.whl^)
+
+tensorflow/tools/docker/parameterized_docker_build.sh
+```
+
+If successful, the image will be tagged as `${USER}/tensorflow:latest` by default.
+
+Rebuilding GPU images requires [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index f88af68cde..ea88d8165f 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -233,13 +233,16 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
 
   # Modify python/pip version if necessary.
   if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
+    if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
-        echo "Modified Dockerfile for python version "\
-"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
-        die "FAILED to modify ${DOCKERFILE} for python3"
+        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+    then
+      echo "Modified Dockerfile for python version "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
+    else
+      die "FAILED to modify ${DOCKERFILE} for python3"
+    fi
   fi
 else
   DOCKERFILE="${TMP_DIR}/Dockerfile"
@@ -250,14 +253,17 @@ else
 
   # Modify python/pip version if necessary.
   if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
+    if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
-        echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
-        die "FAILED to modify ${DOCKERFILE} for python3"
+        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+    then
+      echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
+    else
+      die "FAILED to modify ${DOCKERFILE} for python3"
+    fi
   fi
 fi
 
@@ -277,7 +283,7 @@ fi
 
 # Make sure that there is no other containers of the same image running
 # TODO(cais): Move to an earlier place.
-if [[ ! -z $("${DOCKER_BINARY}" ps | grep "${IMG}") ]]; then
+if "${DOCKER_BINARY}" ps | grep -q "${IMG}"; then
   die "ERROR: It appears that there are docker containers of the image "\
 "${IMG} running. Please stop them before proceeding"
 fi
@@ -310,16 +316,22 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     # on the running docker container
     echo ""
     echo "Performing basic sanity checks on the running container..."
-    wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null && \
-        echo "  PASS: wget tree" || \
-        mark_check_failed "  FAIL: wget tree"
+    if wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null
+    then
+      echo "  PASS: wget tree"
+    else
+      mark_check_failed "  FAIL: wget tree"
+    fi
 
     for NB in ${TMP_DIR}/notebooks/*.ipynb; do
       NB_BASENAME=$(basename "${NB}")
       NB_URL="http://127.0.0.1:${CONTAINER_PORT}/notebooks/${NB_BASENAME}"
-      wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null && \
-          echo "  PASS: wget ${NB_URL}" || \
-          mark_check_failed  "  FAIL: wget ${NB_URL}"
+      if wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null
+      then
+        echo "  PASS: wget ${NB_URL}"
+      else
+        mark_check_failed  "  FAIL: wget ${NB_URL}"
+      fi
     done
   fi
 
diff --git a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
index 2ce0fb394f..7146213b33 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@@ -71,7 +71,7 @@ rm -rf ${LOG_FILE} || \
 
 # Invoke main Python file
 python "${GCS_SMOKE_PY}" --gcs_bucket_url="${GCS_BUCKET_URL}" \
-    2>&1 > "${LOG_FILE}"
+    > "${LOG_FILE}" 2>&1
 
 if [[ $? != "0" ]]; then
   cat ${LOG_FILE}
@@ -92,6 +92,9 @@ NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
 if [[ -z ${NEW_TFREC_URL} ]]; then
   die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
 fi
-"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
-    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
-    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+if "${GSUTIL_BIN}" rm "${NEW_TFREC_URL}"
+then
+  echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+else
+  die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+fi
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index 1487658049..977fe16333 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -20,7 +20,7 @@ if [[ -z "${OUTPUT_FILENAME}"  ]]; then
   exit 1
 fi
 
-GIT_VERSION=`git describe --long --tags`
+GIT_VERSION=$(git describe --long --tags)
 if [[ $? != 0 ]]; then
    GIT_VERSION=unknown;
 fi
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index bfda55d3ad..b4274e67df 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -1055,7 +1055,7 @@ in the future.
 
 The Graph Transform Tool associates names of transforms with the code to
 implement them using the `REGISTER_GRAPH_TRANSFORM()` macro. This takes a string
-and a function, and automatically registers the transform with the tool. You
+and a function, and automagically registers the transform with the tool. You
 will need to watch out for a few things though:
 
 *   Because it's using global C++ objects in each file under the hood, the
diff --git a/tensorflow/tools/graph_transforms/quantize_weights_test.cc b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
index e1a105bdd3..63c5b5a64d 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights_test.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
@@ -90,13 +90,13 @@ class QuantizeWeightsTest : public ::testing::Test {
     EXPECT_EQ("Const", q_weights_const->op());
     EXPECT_EQ(DT_QUINT8, q_weights_const->attr().at("dtype").type());
 
-    // Run the the original graph.
+    // Run the original graph.
     std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
     TF_ASSERT_OK(original_session->Create(original_graph_def));
     std::vector<Tensor> original_outputs;
     TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
 
-    // Run the the quantized graph.
+    // Run the quantized graph.
     std::unique_ptr<Session> quantized_session(NewSession(SessionOptions()));
     TF_ASSERT_OK(quantized_session->Create(quantized_graph_def));
     std::vector<Tensor> quantized_outputs;
diff --git a/tensorflow/tools/lib_package/libtensorflow_java_test.sh b/tensorflow/tools/lib_package/libtensorflow_java_test.sh
index a44298e01a..c44978fc57 100755
--- a/tensorflow/tools/lib_package/libtensorflow_java_test.sh
+++ b/tensorflow/tools/lib_package/libtensorflow_java_test.sh
@@ -29,7 +29,7 @@ TAR="${TAR}"
 
 [ -z "${JAVA}" ] && JAVA="java"
 [ -z "${JAVAC}" ] && JAVAC="javac"
-[ -z "${TAR}"] && TAR="tar"
+[ -z "${TAR}" ] && TAR="tar"
 
 # bazel tests run with ${PWD} set to the root of the bazel workspace
 TARFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz"
diff --git a/tensorflow/tools/lib_package/libtensorflow_test.sh b/tensorflow/tools/lib_package/libtensorflow_test.sh
index 6430c755af..7dfe8eefcc 100755
--- a/tensorflow/tools/lib_package/libtensorflow_test.sh
+++ b/tensorflow/tools/lib_package/libtensorflow_test.sh
@@ -26,7 +26,7 @@ CC="${CC}"
 TAR="${TAR}"
 
 [ -z "${CC}" ] && CC="/usr/bin/gcc"
-[ -z "${TAR}"] && TAR="tar"
+[ -z "${TAR}" ] && TAR="tar"
 
 # bazel tests run with ${PWD} set to the root of the bazel workspace
 TARFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow.tar.gz"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 073988765c..54ba8064e8 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.2.0-rc2'
+_VERSION = '1.2.0'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
index 816ad8c07e..e86584acd1 100644
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@@ -1,6 +1,6 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-###Features
+### Features
 
 * Profile model architectures
   * parameters, tensor shapes, float operations, device placement, etc.
@@ -13,7 +13,7 @@
   * operation configuration check
   * distributed runtime check (Not OSS)
 
-###Interfaces
+### Interfaces
 
 * Python API
 * Command Line
@@ -51,7 +51,7 @@ See [Options](g3doc/options.md) for detail instructions.
 -output                     stdout:
 ```
 
-###Tutorials
+### Tutorials
 
 *  [Python API](g3doc/python_api.md)
 *  [Command Line Interface](g3doc/command_line.md)
diff --git a/tensorflow/tools/tfprof/g3doc/command_line.md b/tensorflow/tools/tfprof/g3doc/command_line.md
index 0d8d56809a..9f0de72e07 100644
--- a/tensorflow/tools/tfprof/g3doc/command_line.md
+++ b/tensorflow/tools/tfprof/g3doc/command_line.md
@@ -126,7 +126,7 @@ tfprof>
 -show_name_regexes          .*
 -hide_name_regexes          IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]*
 -account_displayed_op_only  false
-# supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path.
+# supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path.
 # [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types]
 -select                     params
 # format: output_type:key=value,key=value...
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c8a3a81000..a5172187f7 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -327,11 +327,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   patched_http_archive(
       name = "protobuf",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
+      strip_prefix = "protobuf-3.3.1",
       # TODO: remove patching when tensorflow stops linking same protos into
       #       multiple shared libraries loaded in runtime by python.
       #       This patch fixes a runtime crash when tensorflow is compiled
@@ -345,21 +345,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_google_protobuf",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
+      strip_prefix = "protobuf-3.3.1",
   )
 
   native.http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/v3.3.1.tar.gz",
+          "https://github.com/google/protobuf/archive/v3.3.1.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "30f23a45c6f4515598702a6d19c4295ba92c4a635d7ad8d331a4db9fccff392d",
+      strip_prefix = "protobuf-3.3.1",
   )
 
   native.new_http_archive(
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 83a377dde5..1e29f23d90 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -847,10 +847,10 @@ def _genrule(src_dir, genrule_name, command, outs):
       genrule_name + '",\n' +
       '    outs = [\n' +
       outs +
-      '    ],\n' +
+      '\n    ],\n' +
       '    cmd = """\n' +
       command +
-      '    """,\n' +
+      '\n   """,\n' +
       ')\n\n'
   )
 
diff --git a/third_party/py/BUILD.tpl b/third_party/py/BUILD.tpl
index 1ee9c071ad..cdfe9c9b45 100644
--- a/third_party/py/BUILD.tpl
+++ b/third_party/py/BUILD.tpl
@@ -6,6 +6,16 @@ cc_library(
     name = "python_headers",
     hdrs = [":python_include"],
     includes = ["python_include"],
+    data = select({
+      ":windows" : [":python_import_lib"],
+      "//conditions:default": [],
+    }),
+    linkopts = select({
+      # TODO(pcloudy): Ideally, this should just go into deps after resolving
+      # https://github.com/bazelbuild/bazel/issues/3237,
+      ":windows" : ["$(locations :python_import_lib)"],
+      "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -22,4 +32,6 @@ config_setting(
 
 %{PYTHON_INCLUDE_GENRULE}
 
+%{PYTHON_IMPORT_LIB_GENRULE}
+
 %{NUMPY_INCLUDE_GENRULE}
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index 19a6f1e749..31c4d2b98a 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -117,10 +117,10 @@ def _genrule(src_dir, genrule_name, command, outs):
       genrule_name + '",\n' +
       '    outs = [\n' +
       outs +
-      '    ],\n' +
+      '\n    ],\n' +
       '    cmd = """\n' +
       command +
-      '    """,\n' +
+      '\n   """,\n' +
       ')\n\n'
   )
 
@@ -133,15 +133,20 @@ def _norm_path(path):
   return path
 
 
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name):
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+    src_files = [], dest_files = []):
   """Returns a genrule to symlink(or copy if on Windows) a set of files.
+
+  If src_dir is passed, files will be read from the given directory; otherwise
+  we assume files are in src_files and dest_files
   """
-  src_dir = _norm_path(src_dir)
-  dest_dir = _norm_path(dest_dir)
-  files = _read_dir(repository_ctx, src_dir)
-  # Create a list with the src_dir stripped to use for outputs.
-  dest_files = files.replace(src_dir, '').splitlines()
-  src_files = files.splitlines()
+  if src_dir != None:
+    src_dir = _norm_path(src_dir)
+    dest_dir = _norm_path(dest_dir)
+    files = _read_dir(repository_ctx, src_dir)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, '').splitlines()
+    src_files = files.splitlines()
   command = []
   outs = []
   for i in range(len(dest_files)):
@@ -217,11 +222,23 @@ def _get_python_include(repository_ctx, python_bin):
                      'print(sysconfig.get_python_inc())'],
                     error_msg="Problem getting python include path.",
                     error_details=("Is the Python binary path set up right? " +
-                                   "(See ./configure or BAZEL_BIN_PATH.) " +
+                                   "(See ./configure or PYTHON_BIN_PATH.) " +
                                    "Is distutils installed?"))
   return result.stdout.splitlines()[0]
 
 
+def _get_python_import_lib_name(repository_ctx, python_bin):
+  """Get Python import library name (pythonXY.lib) on Windows."""
+  result = _execute(repository_ctx,
+                    [python_bin, "-c",
+                     'import sys;' +
+                     'print("python" + str(sys.version_info[0]) + str(sys.version_info[1]) + ".lib")'],
+                    error_msg="Problem getting python import library.",
+                    error_details=("Is the Python binary path set up right? " +
+                                   "(See ./configure or PYTHON_BIN_PATH.) "))
+  return result.stdout.splitlines()[0]
+
+
 def _get_numpy_include(repository_ctx, python_bin):
   """Gets the numpy include path."""
   return _execute(repository_ctx,
@@ -240,11 +257,11 @@ def _create_local_python_repository(repository_ctx):
   empty_config = False
   # If local checks were requested, the python and numpy include will be auto
   # detected on the host config (using _PYTHON_BIN_PATH).
+  python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH,
+                            "/usr/bin/python")
   if repository_ctx.attr.local_checks:
     # TODO(nlopezgi): The default argument here is a workaround until
     #                 bazelbuild/bazel#3057 is resolved.
-    python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH,
-                              "/usr/bin/python")
     _check_python_bin(repository_ctx, python_bin)
     python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH,
                               _get_python_lib(repository_ctx, python_bin))
@@ -263,6 +280,10 @@ def _create_local_python_repository(repository_ctx):
                                       '    name = "python_include",\n' +
                                       '    srcs = [],\n' +
                                       ')\n'),
+        "%{PYTHON_IMPORT_LIB_GENRULE}": ('filegroup(\n' +
+                                      '    name = "python_import_lib",\n' +
+                                      '    srcs = [],\n' +
+                                      ')\n'),
         "%{NUMPY_INCLUDE_GENRULE}": ('filegroup(\n' +
                                       '    name = "numpy_include",\n' +
                                       '    srcs = [],\n' +
@@ -271,10 +292,21 @@ def _create_local_python_repository(repository_ctx):
   else:
     python_include_rule = _symlink_genrule_for_dir(
         repository_ctx, python_include, 'python_include', 'python_include')
+    python_import_lib_genrule = ""
+    # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+    # See https://docs.python.org/3/extending/windows.html
+    if _is_windows(repository_ctx):
+      python_include = _norm_path(python_include)
+      python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
+      python_import_lib_src = python_include.rsplit('/', 1)[0] + "/libs/" + python_import_lib_name
+      python_import_lib_genrule = _symlink_genrule_for_dir(
+        repository_ctx, None, '', 'python_import_lib',
+        [python_import_lib_src], [python_import_lib_name])
     numpy_include_rule = _symlink_genrule_for_dir(
         repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
     _tpl(repository_ctx, "BUILD", {
         "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
         "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
     })
 
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index 94c5e6aaad..8b0a7a7d4a 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -15,7 +15,7 @@ COMPUTECPP_INCLUDE = COMPUTECPP_ROOT + 'include'
 
 def main():
   remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable', '-Wignored-attributes')
-    # remove -fsamotoze-coverage from string with g++
+  # remove -fsanitize-coverage from string with g++
   if 'g++' in CPU_CXX_COMPILER:
     remove_flags += ('-fsanitize-coverage',)
   compiler_flags = [flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)]
@@ -43,7 +43,7 @@ def main():
 
   # create a blacklist of folders that will be skipped when compiling with ComputeCpp
   skip_extensions = [".cu.cc"]
-  skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "tensorflow/tensorboard", "third_party", "external", "hexagon"]
+  skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "third_party", "external", "hexagon"]
   skip_folders = [(folder + '/') for folder in skip_folders]
   # if compiling external project skip computecpp
   if any(compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(_folder in output_file_name for _folder in skip_folders):
@@ -79,7 +79,7 @@ def main():
                                       '-Xclang', '-cl-denorms-are-zero', '-Xclang', '-cl-fp32-correctly-rounded-divide-sqrt']
   # disable flags enabling SIMD instructions
   computecpp_device_compiler_flags += [flag for flag in compiler_flags if \
-    not any(x in flag.lower() for x in ('-fsanitize', '=native', '=core2', 'msse', 'vectorize', 'mavx', 'mmmx', 'm3dnow', 'fma'))]
+    not any(x in flag.lower() for x in ('-fsanitize', '-fno-canonical-system-headers', '=native', '=core2', 'msse', 'vectorize', 'mavx', 'mmmx', 'm3dnow', 'fma'))]
 
   x = call([COMPUTECPP_DRIVER] + computecpp_device_compiler_flags)
   if x == 0:
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index c66a9f007d..6cad190630 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -33,7 +33,7 @@ cc_library(
         sycl_library_path("ComputeCpp")
     ],
     includes = ["include/"],
-    linkstatic = 1,
+    linkstatic = 0,
 )
 
 cc_library(
diff --git a/tools/tf_env_collect.sh b/tools/tf_env_collect.sh
index abeebeadea..a1c9c88c58 100755
--- a/tools/tf_env_collect.sh
+++ b/tools/tf_env_collect.sh
@@ -16,49 +16,62 @@
 
 set -u  # Check for undefined variables
 
-echo "Collecting system information..."
-
-OUTPUT_FILE=tf_env.txt
-
-echo >> $OUTPUT_FILE
-echo "== cat /etc/issue ===============================================" >> $OUTPUT_FILE
-uname -a >> $OUTPUT_FILE
-uname=`uname -s`
-if [ "$(uname)" == "Darwin" ]; then
-  echo Mac OS X `sw_vers -productVersion` >> $OUTPUT_FILE
-elif [ "$(uname)" == "Linux" ]; then
-  cat /etc/*release | grep VERSION >> $OUTPUT_FILE
-fi
-
-
-echo >> $OUTPUT_FILE
-echo '== are we in docker =============================================' >> $OUTPUT_FILE
-num=`cat /proc/1/cgroup | grep docker | wc -l`;
-if [ $num -ge 1 ]; then
-  echo "Yes" >> $OUTPUT_FILE
-else
-  echo "No" >> $OUTPUT_FILE
-fi
-
-echo >> $OUTPUT_FILE
-echo '== compiler =====================================================' >> $OUTPUT_FILE
-c++ --version 2>&1 >> $OUTPUT_FILE
-
-echo >> $OUTPUT_FILE
-echo '== uname -a =====================================================' >> $OUTPUT_FILE
-uname -a >> $OUTPUT_FILE
+die() {
+  # Print a message and exit with code 1.
+  #
+  # Usage: die <error_message>
+  #   e.g., die "Something bad happened."
 
-echo >> $OUTPUT_FILE
-echo '== check pips ===================================================' >> $OUTPUT_FILE
-pip list 2>&1 | grep "proto\|numpy\|tensorflow" >> $OUTPUT_FILE
+  echo $@
+  exit 1
+}
 
+echo "Collecting system information..."
 
-echo >> $OUTPUT_FILE
-echo '== check for virtualenv =========================================' >> $OUTPUT_FILE
-python -c "import sys;print(hasattr(sys, \"real_prefix\"))" >> $OUTPUT_FILE
+OUTPUT_FILE=tf_env.txt
+python_bin_path=$(which python || which python3 || die "Cannot find Python binary")
+
+{
+  echo
+  echo "== cat /etc/issue ==============================================="
+  uname -a
+  uname=`uname -s`
+  if [ "$(uname)" == "Darwin" ]; then
+    echo Mac OS X `sw_vers -productVersion`
+  elif [ "$(uname)" == "Linux" ]; then
+    cat /etc/*release | grep VERSION
+  fi
+  
+  echo
+  echo '== are we in docker ============================================='
+  num=`cat /proc/1/cgroup | grep docker | wc -l`;
+  if [ $num -ge 1 ]; then
+    echo "Yes"
+  else
+    echo "No"
+  fi
+  
+  echo
+  echo '== compiler ====================================================='
+  c++ --version 2>&1
+  
+  echo
+  echo '== uname -a ====================================================='
+  uname -a
+  
+  echo
+  echo '== check pips ==================================================='
+  pip list 2>&1 | grep "proto\|numpy\|tensorflow"
+  
+  
+  echo
+  echo '== check for virtualenv ========================================='
+  ${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))"
+  
+  echo
+  echo '== tensorflow import ============================================'
+} >> ${OUTPUT_FILE}
 
-echo >> $OUTPUT_FILE
-echo '== tensorflow import ============================================' >> $OUTPUT_FILE
 cat <<EOF > /tmp/check_tf.py
 import tensorflow as tf;
 print("tf.VERSION = %s" % tf.VERSION)
@@ -67,32 +80,34 @@ print("tf.COMPILER_VERSION = %s" % tf.GIT_VERSION)
 with tf.Session() as sess:
   print("Sanity check: %r" % sess.run(tf.constant([1,2,3])[:1]))
 EOF
-python /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
-
-DEBUG_LD=libs python -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
-grep libcudnn.so /tmp/loadedlibs >> $OUTPUT_FILE
-
-echo >> $OUTPUT_FILE
-echo '== env ==========================================================' >> $OUTPUT_FILE
-if [ -z ${LD_LIBRARY_PATH+x} ]; then
-  echo "LD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
-else
-  echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH}  >> $OUTPUT_FILE;
-fi
-if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
-  echo "DYLD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
-else
-  echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH}  >> $OUTPUT_FILE;
-fi
-
-
-echo >> $OUTPUT_FILE >> $OUTPUT_FILE
-echo '== nvidia-smi ===================================================' >> $OUTPUT_FILE
-nvidia-smi 2>&1 >> $OUTPUT_FILE
-
-echo >> $OUTPUT_FILE
+${python_bin_path} /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
+
+DEBUG_LD=libs ${python_bin_path} -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
+
+{
+  grep libcudnn.so /tmp/loadedlibs
+  echo
+  echo '== env =========================================================='
+  if [ -z ${LD_LIBRARY_PATH+x} ]; then
+    echo "LD_LIBRARY_PATH is unset";
+  else
+    echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
+  fi
+  if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
+    echo "DYLD_LIBRARY_PATH is unset";
+  else
+    echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
+  fi
+  
+  
+  echo
+  echo '== nvidia-smi ==================================================='
+  nvidia-smi 2>&1
+  
+  echo
+  echo '== cuda libs  ==================================================='
+} >> ${OUTPUT_FILE}
 
-echo '== cuda libs  ===================================================' >> $OUTPUT_FILE
 find /usr/local -type f -name 'libcudart*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
 find /usr/local -type f -name 'libudnn*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
 
-- 
cgit v1.2.3