149 files changed, 2122 insertions, 428 deletions
diff --git a/README.md b/README.md
index 2dc3dcb3e6..c99e9350dd 100644
--- a/README.md
+++ b/README.md
@@ -33,11 +33,12 @@ and discussion.**
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
diff --git a/RELEASE.md b/RELEASE.md
index e9f0361511..f7f5e37bd7 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -89,6 +89,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
 * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
 
 ## Bug Fixes and Other Changes
+* Numerous C++ API updates.
 * New op: `parallel_stack`.
 * Introducing common tf io compression options constants for
   RecordReader/RecordWriter.
@@ -127,6 +128,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
 * `tf.divide` now honors the name field.
 * Make metrics weight broadcasting more strict.
 * Add new queue-like `StagingArea` and new ops: `stage` and `unstage`.
+* Enable inplace update ops for strings on CPU. Speed up string concat.
 
 ## Thanks to our Contributors
 
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index dbcf072015..1b107e2d06 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -115,8 +115,6 @@ class Coordinator {
   std::vector<std::unique_ptr<RunnerInterface>> runners_
       GUARDED_BY(runners_lock_);
 
-  std::atomic<int> num_runners_to_cancel_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(Coordinator);
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 39c5567f80..1f9ac029c7 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -127,32 +127,21 @@ XLA_MAKE_BINARY(GreaterEqual, b->Ge(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(Less, b->Lt(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(LessEqual, b->Le(lhs, rhs, extend_dimensions));
 
-#undef XLA_MAKE_BINARY
+// Non-linear ops
+XLA_MAKE_BINARY(SigmoidGrad,
+                b->Mul(b->Mul(rhs, lhs),
+                       b->Sub(XlaHelpers::One(b, input_type(0)), lhs)));
 
-#define XLA_MAKE_BINARY_MAP(Name, HLO)                                    \
-  class Name##Op : public XlaBinaryMapOp {                                \
-   public:                                                                \
-    explicit Name##Op(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} \
-    void BuildMapLambda(xla::ComputationBuilder* b,                       \
-                        const xla::ComputationDataHandle& lhs,            \
-                        const xla::ComputationDataHandle& rhs) override { \
-      HLO;                                                                \
-    }                                                                     \
-  };                                                                      \
-  REGISTER_XLA_OP(#Name, Name##Op)
+XLA_MAKE_BINARY(SoftplusGrad,
+                b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)),
+                                   XlaHelpers::One(b, input_type(1)))));
+
+XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
+                                             b->Mul(lhs, lhs))));
 
-XLA_MAKE_BINARY_MAP(Pow, b->Pow(lhs, rhs));
-XLA_MAKE_BINARY_MAP(SigmoidGrad,
-                    b->Mul(b->Mul(rhs, lhs),
-                           b->Sub(XlaHelpers::One(b, input_type(0)), lhs)));
-XLA_MAKE_BINARY_MAP(SoftplusGrad,
-                    b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)),
-                                       XlaHelpers::One(b, input_type(1)))));
-XLA_MAKE_BINARY_MAP(TanhGrad,
-                    b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
-                                       b->Mul(lhs, lhs))));
-
-#undef XLA_MAKE_BINARY_MAP
+XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions));
+
+#undef XLA_MAKE_BINARY
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index f0687c1d4b..ba38693325 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -32,9 +32,7 @@ namespace tensorflow {
 // description of the operation; and Computation adds the
 // implementation of the operation to a xla::ComputationBuilder. For most
 // arithmetic Ops XLA handles the broadcasting automatically given the input
-// tensors. Ops like ReluGrad that need to map a scalar function over the inputs
-// can use the XlaBinaryMapOp subclass below which handles manual
-// broadcasting of the inputs.
+// tensors.
 class XlaBinaryOp : public XlaOpKernel {
  public:
   explicit XlaBinaryOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -83,6 +81,8 @@ class XlaBinaryOp : public XlaOpKernel {
 // virtual methods to override: description is a textual description
 // of the mapped function; and BuildMapLambda adds the
 // implementation of the lambda to a xla::ComputationBuilder.
+// Operations may have better performance if implemented as graphs of
+// element-wise tensor operations.
 class XlaBinaryMapOp : public XlaBinaryOp {
  public:
   explicit XlaBinaryMapOp(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index 2dea4032c0..d1b857c22a 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -50,36 +50,37 @@ class Relu6Op : public XlaOpKernel {
   }
 };
 
-// A subclass of a XlaBinaryMapOp must build the lambda computation
-// that describes the (scalar,scalar)->scalar function to apply to
-// each element of the input. We have to use XlaBinaryMapOp instead of
-// XlaBinaryOp here because XLA Select does not do automatic
-// broadcasting.
-class ReluGradOp : public XlaBinaryMapOp {
+class ReluGradOp : public XlaOpKernel {
  public:
-  explicit ReluGradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {}
+  explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
   // otherwise return 0.
-  void BuildMapLambda(xla::ComputationBuilder* b,
-                      const xla::ComputationDataHandle& gradient,
-                      const xla::ComputationDataHandle& feature) override {
-    const auto zero = XlaHelpers::Zero(b, input_type(0));
-    b->Select(b->Gt(feature, zero), gradient, zero);
+  void Compile(XlaOpKernelContext* ctx) {
+    xla::ComputationBuilder* b = ctx->builder();
+    const TensorShape shape = ctx->InputShape(0);
+    const auto zero =
+        b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
+    const auto pred = b->Gt(ctx->Input(1), zero);
+    ctx->SetOutput(0, b->Select(pred, ctx->Input(0), zero));
   }
 };
 
-class Relu6GradOp : public XlaBinaryMapOp {
+class Relu6GradOp : public XlaOpKernel {
  public:
-  explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {}
+  explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
   // otherwise return 0.
-  void BuildMapLambda(xla::ComputationBuilder* b,
-                      const xla::ComputationDataHandle& gradient,
-                      const xla::ComputationDataHandle& feature) override {
-    const auto zero = XlaHelpers::Zero(b, input_type(0));
-    auto six = XlaHelpers::IntegerLiteral(b, input_type(0), 6);
-    b->Select(b->LogicalAnd(b->Lt(feature, six), b->Gt(feature, zero)),
-              gradient, zero);
+  void Compile(XlaOpKernelContext* ctx) {
+    xla::ComputationBuilder* b = ctx->builder();
+    const TensorShape shape = ctx->InputShape(0);
+    const auto zero =
+        b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
+    const auto six = b->Broadcast(
+        XlaHelpers::IntegerLiteral(b, input_type(0), 6), shape.dim_sizes());
+    auto out = b->Select(
+        b->LogicalAnd(b->Lt(ctx->Input(1), six), b->Gt(ctx->Input(1), zero)),
+        ctx->Input(0), zero);
+    ctx->SetOutput(0, out);
   }
 };
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index ae7695ade5..2ee6b51c18 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -1010,8 +1010,9 @@ ComputationDataHandle ComputationBuilder::SqrtF32(
 }
 
 ComputationDataHandle ComputationBuilder::Pow(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  return BinaryOp(BINOP_POW, lhs, rhs, /*broadcast_dimensions=*/{});
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions);
 }
 
 ComputationDataHandle ComputationBuilder::ConvertElementType(
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index a49e5a8843..98649ba02f 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -504,8 +504,9 @@ class ComputationBuilder {
   ComputationDataHandle SquareF32(const ComputationDataHandle& operand);
 
   // Enqueues a lhs^rhs computation onto the computation.
-  ComputationDataHandle Pow(const ComputationDataHandle& lhs,
-                            const ComputationDataHandle& rhs);
+  ComputationDataHandle Pow(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
 
   // Enqueues an operator that tests if the operand's values are finite, i.e.,
   // not Inf or NaN. Defined only for floating-point types. Returns an array of
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index a2f5774b9f..7dd7dd4070 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -36,4 +36,5 @@ ExternalProject_Add(eigen
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL}
         -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
+        -DBUILD_TESTING:BOOL=OFF
 )
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 84107706ca..7762328cfe 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -163,6 +163,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
       # training tests
       "${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py"  # Needs tf.contrib fix.
+      "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"  # Needs tf.contrib fix.
       "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py"  # Needs portpicker.
       "${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py"  # Needs tf.contrib fix.
       "${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py"  # Overflow error.
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index ef9c8b2e8e..6f64cbee3f 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -108,3 +108,29 @@ Finally, you can run the inference tests on your device.
 adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
 "/data/local/tmp/hexagon_graph_execution"
 ```
+
+#### Troubleshooting
+If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+
+If the lack of a testsig library is your problem, you will see errors of the type:
+`vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
+appearing in adb logcat.
+
+There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
+1. Run adb as root: `adb root`
+2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
+3. Convert the decimal number you get as output to hex
+4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
+5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
+6. Push the shared library to your device:
+```
+adb root
+adb wait-for-device
+adb remount
+adb wait-for-device
+adb shell mkdir /system/lib/rfsa
+adb shell mkdir /system/lib/rfsa/adsp
+adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+```
+
+After rebooting your device, you should be able to run the sample application.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index bfe1cd0aed..f9ba6711e6 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -748,15 +748,16 @@ class _DNNEstimator(estimator.Estimator):
   To create a _DNNEstimator for binary classification, where
   estimator = _DNNEstimator(
       feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
-      head=head=head_lib._multi_class__head(n_classes=2),
+      head=head_lib._multi_class__head(n_classes=2),
       hidden_units=[1024, 512, 256])
 
   If your label is keyed with "y" in your labels dict, and weights are keyed
   with "w" in features dict, and you want to enable centered bias,
-  head=head_lib._multi_class__head(n_classes=2,
-                                   label_name="x"
-                                   weight_column_name="w",
-                                   enable_centered_bias=True)
+  head = head_lib._multi_class__head(
+      n_classes=2,
+      label_name="x",
+      weight_column_name="w",
+      enable_centered_bias=True)
   estimator = _DNNEstimator(
       feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
       head=head,
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 2e6f2ac05c..c1ec46d763 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -362,7 +362,7 @@ class LSTMBlockCell(core_rnn_cell.RNNCell):
 
   @property
   def state_size(self):
-    return (self._num_units,) * 2
+    return core_rnn_cell.LSTMStateTuple(self._num_units, self._num_units)
 
   @property
   def output_size(self):
@@ -401,7 +401,8 @@ class LSTMBlockCell(core_rnn_cell.RNNCell):
           forget_bias=self._forget_bias,
           use_peephole=self._use_peephole)
 
-      return (h, (cs, h))
+      new_state = core_rnn_cell.LSTMStateTuple(cs, h)
+      return h, new_state
 
 
 class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
@@ -544,7 +545,9 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
         # Input was a list, so return a list
         outputs = array_ops.unstack(outputs)
 
-      return outputs, (final_cell_state, final_output)
+      final_state = core_rnn_cell.LSTMStateTuple(final_cell_state,
+                                                 final_output)
+      return outputs, final_state
 
   def _gather_states(self, data, indices, batch_size):
     """Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index c14f4ee440..212f2c7a7b 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -410,8 +410,7 @@ def evaluate_repeatedly(checkpoint_dir,
           '%Y-%m-%d-%H:%M:%S', time.gmtime()))
     num_evaluations += 1
 
-    reached_max = num_evaluations >= max_number_of_evaluations
-    if max_number_of_evaluations and reached_max:
+    if max_number_of_evaluations is not None and num_evaluations >= max_number_of_evaluations:
       return final_ops_hook.final_ops_values
 
   logging.info('Timed-out waiting for a checkpoint.')
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 2f8bacca85..b59b749182 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -117,6 +117,8 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
+load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
+
 # -----------------------------------------------------------------------------
 # Public targets
 
@@ -686,6 +688,8 @@ cc_library(
         "//tensorflow/core/kernels:array_not_windows",
         "//tensorflow/core/kernels:math_not_windows",
         "//tensorflow/core/kernels:quantized_ops",
+    ]) + if_mkl([
+        "//tensorflow/core/kernels:mkl_ops",
     ]),
 )
 
@@ -745,6 +749,7 @@ cc_library(
         ":proto_text",
         ":protos_all_cc",
         ":shape_inference_testutil",
+        ":sycl_runtime",
         ":tensor_testutil",
         ":test",
         "//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 0d238276f4..b7ef9361e9 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -25,6 +25,9 @@ string SYCLAllocator::Name() { return "device:SYCL"; }
 
 void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   assert(device_);
+  if (num_bytes == 0) {
+    return device_->allocate(1);
+  }
   auto p = device_->allocate(num_bytes);
   return p;
 }
@@ -42,6 +45,6 @@ void SYCLAllocator::EnterLameDuckMode() {
   }
 }
 
-} // namespace tensorflow
+}  // namespace tensorflow
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index c896f7f603..15d9ab41a4 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -27,8 +27,8 @@ limitations under the License.
 namespace tensorflow {
 
 class SYCLAllocator : public Allocator {
-public:
-  SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
+ public:
+  SYCLAllocator(Eigen::QueueInterface *device) : device_(device) {}
   virtual ~SYCLAllocator() override;
   string Name() override;
   void *AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -36,11 +36,12 @@ public:
 
   void EnterLameDuckMode();
   virtual bool ShouldAllocateEmptyTensors() override final { return true; }
-private:
+
+ private:
   Eigen::QueueInterface *device_;  // not owned
   TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
 };
 
-} // namespace tensorflow
+}  // namespace tensorflow
 
-#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index 0abe25c373..2c2185b2c0 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-static std::unordered_set<SYCLDevice*> live_devices;
+static std::unordered_set<SYCLDevice *> live_devices;
 static bool first_time = true;
 
 void ShutdownSycl() {
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index b5a72d9476..a5c7c5f0ec 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -34,10 +34,11 @@ class SYCLDevice : public LocalDevice {
              Bytes memory_limit, const DeviceLocality &locality,
              const string &physical_device_desc, SYCLSelector sycl_selector,
              Allocator *cpu_allocator)
-      : LocalDevice(options, Device::BuildDeviceAttributes(
-                                 name, DEVICE_SYCL, memory_limit, locality,
-                                 physical_device_desc),
-                    nullptr),
+      : LocalDevice(
+            options,
+            Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
+                                          locality, physical_device_desc),
+            nullptr),
         cpu_allocator_(cpu_allocator),
         sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
         sycl_device_(new Eigen::SyclDevice(sycl_queue_)),
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
index a6be9195d4..1c868f5606 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
 
 namespace tensorflow {
 
@@ -31,68 +31,68 @@ void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
     const void *src_ptr = DMAHelper::base(cpu_tensor);
     void *dst_ptr = DMAHelper::base(device_tensor);
     switch (cpu_tensor->dtype()) {
-    case DT_FLOAT:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_DOUBLE:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT32:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT64:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_HALF:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<Eigen::half *>(dst_ptr),
-          static_cast<const Eigen::half *>(src_ptr), total_bytes);
-      break;
-    case DT_COMPLEX64:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<std::complex<float> *>(dst_ptr),
-          static_cast<const std::complex<float> *>(src_ptr), total_bytes);
-      break;
-    case DT_COMPLEX128:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<std::complex<double> *>(dst_ptr),
-          static_cast<const std::complex<double> *>(src_ptr), total_bytes);
-      break;
-    case DT_INT8:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT16:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_UINT8:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_UINT16:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_BOOL:
-      device->eigen_sycl_device()->memcpyHostToDevice(
-          static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
-          total_bytes);
-      break;
-    default:
-      assert(false && "unsupported type");
+      case DT_FLOAT:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_DOUBLE:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<double *>(dst_ptr),
+            static_cast<const double *>(src_ptr), total_bytes);
+        break;
+      case DT_INT32:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_INT64:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_HALF:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<Eigen::half *>(dst_ptr),
+            static_cast<const Eigen::half *>(src_ptr), total_bytes);
+        break;
+      case DT_COMPLEX64:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<std::complex<float> *>(dst_ptr),
+            static_cast<const std::complex<float> *>(src_ptr), total_bytes);
+        break;
+      case DT_COMPLEX128:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<std::complex<double> *>(dst_ptr),
+            static_cast<const std::complex<double> *>(src_ptr), total_bytes);
+        break;
+      case DT_INT8:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_INT16:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_UINT8:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_UINT16:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<uint16 *>(dst_ptr),
+            static_cast<const uint16 *>(src_ptr), total_bytes);
+        break;
+      case DT_BOOL:
+        device->eigen_sycl_device()->memcpyHostToDevice(
+            static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
+            total_bytes);
+        break;
+      default:
+        assert(false && "unsupported type");
     }
   }
   device->eigen_sycl_device()->synchronize();
@@ -106,71 +106,71 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
                                               StatusCallback done) {
   const int64 total_bytes = device_tensor->TotalBytes();
   if (total_bytes > 0) {
-    const void* src_ptr = DMAHelper::base(device_tensor);
-    void* dst_ptr = DMAHelper::base(cpu_tensor);
+    const void *src_ptr = DMAHelper::base(device_tensor);
+    void *dst_ptr = DMAHelper::base(cpu_tensor);
     switch (device_tensor->dtype()) {
-    case DT_FLOAT:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_DOUBLE:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT32:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT64:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_HALF:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<Eigen::half *>(dst_ptr),
-          static_cast<const Eigen::half *>(src_ptr), total_bytes);
-      break;
-    case DT_COMPLEX64:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<std::complex<float> *>(dst_ptr),
-          static_cast<const std::complex<float> *>(src_ptr), total_bytes);
-      break;
-    case DT_COMPLEX128:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<std::complex<double> *>(dst_ptr),
-          static_cast<const std::complex<double> *>(src_ptr), total_bytes);
-      break;
-    case DT_INT8:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_INT16:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_UINT8:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_UINT16:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
-          total_bytes);
-      break;
-    case DT_BOOL:
-      device->eigen_sycl_device()->memcpyDeviceToHost(
-          static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
-          total_bytes);
-      break;
-    default:
-      assert(false && "unsupported type");
+      case DT_FLOAT:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_DOUBLE:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<double *>(dst_ptr),
+            static_cast<const double *>(src_ptr), total_bytes);
+        break;
+      case DT_INT32:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_INT64:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_HALF:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<Eigen::half *>(dst_ptr),
+            static_cast<const Eigen::half *>(src_ptr), total_bytes);
+        break;
+      case DT_COMPLEX64:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<std::complex<float> *>(dst_ptr),
+            static_cast<const std::complex<float> *>(src_ptr), total_bytes);
+        break;
+      case DT_COMPLEX128:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<std::complex<double> *>(dst_ptr),
+            static_cast<const std::complex<double> *>(src_ptr), total_bytes);
+        break;
+      case DT_INT8:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_INT16:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_UINT8:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
+            total_bytes);
+        break;
+      case DT_UINT16:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<uint16 *>(dst_ptr),
+            static_cast<const uint16 *>(src_ptr), total_bytes);
+        break;
+      case DT_BOOL:
+        device->eigen_sycl_device()->memcpyDeviceToHost(
+            static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
+            total_bytes);
+        break;
+      default:
+        assert(false && "unsupported type");
     }
   }
   device->eigen_sycl_device()->synchronize();
@@ -178,4 +178,4 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
 }
 
 }  // namespace tensorflow
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.h b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
index 1f7ad543d9..0f8f17b805 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_context.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
@@ -26,7 +26,7 @@ limitations under the License.
 namespace tensorflow {
 
 class SYCLDeviceContext : public DeviceContext {
-public:
+ public:
   SYCLDeviceContext() {}
 
   ~SYCLDeviceContext() override {}
@@ -40,6 +40,6 @@ public:
                              StatusCallback done) override;
 };
 
-} // namespace tensorflow
+}  // namespace tensorflow
 
-#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
+#endif  // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 51eb4973d8..a643fc7258 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -21,7 +21,7 @@ limitations under the License.
 namespace tensorflow {
 
 class SYCLDeviceFactory : public DeviceFactory {
-public:
+ public:
   Status CreateDevices(const SessionOptions &options, const string &name_prefix,
                        std::vector<Device *> *devices) override {
     int n = 1;
@@ -31,10 +31,10 @@ public:
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
-      devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20),
-                                        DeviceLocality(),
-                                        SYCLDevice::GetShortDeviceDescription(),
-                                        cl::sycl::gpu_selector(), cpu_allocator()));
+      devices->push_back(
+          new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(),
+                         SYCLDevice::GetShortDeviceDescription(),
+                         cl::sycl::gpu_selector(), cpu_allocator()));
     }
     return Status::OK();
   }
@@ -43,4 +43,4 @@ public:
 REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
 }
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 9bb92ad7ef..41ea0267b2 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -383,7 +383,6 @@ Status MasterSession::ReffedClientGraph::DoBuildPartitions(
 
   // Partition the graph.
   Status s;
-  std::unordered_map<string, GraphDef> graph_partitions;
   return Partition(popts, &client_graph_->graph, out_partitions);
 }
 
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index 8f8d9fd08e..c1fe5517c6 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -21,6 +21,10 @@ limitations under the License.
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
+
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -66,6 +70,17 @@ struct proxy_type_pod<GPUDevice, 2> {
   typedef Eigen::half type;
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+template <>
+struct proxy_type_pod<SYCLDevice, 8> {
+  typedef double type;
+};
+template <>
+struct proxy_type_pod<SYCLDevice, 4> {
+  typedef float type;
+};
+#endif // TENSORFLOW_USE_SYCL
+
 /// If POD we use proxy_type_pod, otherwise this maps to identiy.
 template <typename Device, typename T>
 struct proxy_type {
@@ -81,6 +96,10 @@ struct proxy_type {
       TF_CALL_int8(m) TF_CALL_complex128(m)
 #define TF_CALL_GPU_PROXY_TYPES(m) \
   TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m)
+#ifdef TENSORFLOW_USE_SYCL
+#define TF_CALL_SYCL_PROXY_TYPES(m) \
+  TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 2503b441f7..fb7d63767d 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -39,6 +39,10 @@ load(
     "tf_proto_library",
     "tf_kernel_tests_linkstatic",
 )
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -4294,6 +4298,17 @@ tf_cc_test(
     ],
 )
 
+if_mkl(
+    tf_kernel_library(
+        name = "mkl_ops",
+        prefix = "mkl_matmul",
+        deps = [
+            ":math",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index ab82c247d6..562934ed63 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -34,6 +34,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 #define CURRY_TYPES2(FN, arg0)   \
   FN(arg0, bool);                \
@@ -206,6 +209,52 @@ REGISTER_CAST_GPU(bfloat16, float);
 #undef REGISTER_CAST_GPU
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+class SyclCastOp : public CastOpBase {
+ public:
+  explicit SyclCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+    OP_REQUIRES_OK(ctx, Prepare());
+  }
+
+ private:
+  Status Prepare() {
+    if (src_dtype_ == dst_dtype_) {
+      work_ = nullptr;  // Identity
+      return Status::OK();
+    }
+    if (src_dtype_ == DT_BOOL) {
+      work_ = GetSyclCastFromBool(dst_dtype_);
+    } else if (src_dtype_ == DT_INT32) {
+      work_ = GetSyclCastFromInt32(dst_dtype_);
+    } else if (src_dtype_ == DT_INT64) {
+      work_ = GetSyclCastFromInt64(dst_dtype_);
+    } else if (src_dtype_ == DT_FLOAT) {
+      work_ = GetSyclCastFromFloat(dst_dtype_);
+    } else if (src_dtype_ == DT_DOUBLE) {
+      work_ = GetSyclCastFromDouble(dst_dtype_);
+    }
+
+    return work_ == nullptr ? Unimplemented() : Status::OK();
+  }
+};
+
+#define REGISTER_CAST_SYCL(srctype, dsttype)                    \
+  REGISTER_KERNEL_BUILDER(Name("Cast")                         \
+                              .TypeConstraint<srctype>("SrcT") \
+                              .TypeConstraint<dsttype>("DstT") \
+                              .Device(DEVICE_SYCL),             \
+                          SyclCastOp)
+
+CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
+CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
+CURRY_TYPES2(REGISTER_CAST_SYCL, int64);
+CURRY_TYPES2(REGISTER_CAST_SYCL, float);
+CURRY_TYPES2(REGISTER_CAST_SYCL, double);
+
+#undef REGISTER_CAST_SYCL
+
+#endif // TENSORFLOW_USE_SYCL
+
 #undef CURRY_TYPES2
 
 // HostCast differs from Cast in that its input and output are in host memory.
@@ -213,5 +262,10 @@ REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"),
     CpuCastOp);
-
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
+    CpuCastOp);
+#endif // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index cb7cc81937..1ee0796ac1 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -33,6 +33,16 @@ struct CastFunctor<Eigen::ThreadPoolDevice, O, I> {
   }
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename O, typename I>
+struct CastFunctor<Eigen::SyclDevice, O, I> {
+  void operator()(const Eigen::SyclDevice& d, typename TTypes<O>::Flat o,
+                  typename TTypes<I>::ConstFlat i) {
+    o.device(d) = i.template cast<O>();
+  }
+};
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace functor
 
 #define CURRY_TYPES3(FN, arg0, arg1)   \
@@ -140,6 +150,25 @@ GetGpuCastFromBfloat(DataType dst_dtype);
 
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromBool(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt32(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt64(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromFloat(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromDouble(DataType dst_dtype);
+
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index 92fee89a47..a13f163009 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -34,4 +34,14 @@ GetGpuCastFromBool(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromBool(DataType dst_dtype) {
+  CURRY_TYPES3(CAST_CASE, SYCLDevice, bool);
+  return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index fd20061d21..fdc8d51158 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -34,4 +34,14 @@ GetGpuCastFromDouble(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromDouble(DataType dst_dtype) {
+  CURRY_TYPES3(CAST_CASE, SYCLDevice, double);
+  return nullptr;
+}
+#endif // TENSORFLOW_USE_SYC
+
 }  // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 71e63fbff0..1241dcd8f2 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -49,4 +49,14 @@ GetGpuCastFromFloat(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromFloat(DataType dst_dtype) {
+  CURRY_TYPES3(CAST_CASE, SYCLDevice, float);
+  return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index 0fc6e16afe..fca9cd60ec 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -34,4 +34,14 @@ GetGpuCastFromInt32(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt32(DataType dst_dtype) {
+  CURRY_TYPES3(CAST_CASE, CPUDevice, int32);
+  return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index b5571b19a5..c0a543708d 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -19,6 +19,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetCpuCastFromInt64(DataType dst_dtype) {
@@ -34,4 +37,13 @@ GetGpuCastFromInt64(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt64(DataType dst_dtype) {
+  CURRY_TYPES3(CAST_CASE, SYCLDevice, int64);
+  return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 5b7529bb8a..a106f287c1 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
   testing::BytesProcessed(static_cast<int64>(iters) * num *
                           (sizeof(float) + sizeof(int64)));
   testing::UseRealTime();
+#if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+  test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
+#endif // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
   testing::BytesProcessed(static_cast<int64>(iters) * num *
                           (sizeof(bool) + sizeof(float)));
   testing::UseRealTime();
+#if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+  test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
+#endif // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
   testing::BytesProcessed(static_cast<int64>(iters) * num *
                           (sizeof(float) + sizeof(Eigen::half)));
   testing::UseRealTime();
+#if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
+#endif // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
   testing::BytesProcessed(static_cast<int64>(iters) * num *
                           (sizeof(float) + sizeof(Eigen::half)));
   testing::UseRealTime();
+#if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
+#endif // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
 
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index cef873f804..14e6e1bc32 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -38,6 +38,14 @@ void ConcatGPU(
     Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
 
 #endif  // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+void ConcatSYCL(const Eigen::SyclDevice& d,
+               const std::vector<
+                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+               typename TTypes<T, 2>::Matrix* output);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index f83aed6aef..f89948350c 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -74,4 +74,23 @@ REGISTER(qint16)
 REGISTER(qint32)
 REGISTER(bfloat16)
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+void ConcatSYCL(const Eigen::SyclDevice& d,
+               const std::vector<
+                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+               typename TTypes<T, 2>::Matrix* output) {
+  ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
+                   output);
+}
+#define REGISTER_SYCL(T)                                                      \
+ template void ConcatSYCL<T>(                                                 \
+     const Eigen::SyclDevice&,                                                \
+     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+     typename TTypes<T, 2>::Matrix* output);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL)
+
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 9d37cafb4e..6a933efde4 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -126,4 +126,39 @@ void ConcatCPUImpl(
         cost_per_unit, work);
 }
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename ElementCopier>
+void ConcatSYCLImpl(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    int64 cost_per_unit, ElementCopier copier,
+    typename TTypes<T, 2>::Matrix* output) {
+  size_t num_inputs = inputs.size();
+
+  std::vector<ptrdiff_t> sizes;
+  sizes.reserve(num_inputs);
+  int64 row_size = 0;
+  for (const auto& input : inputs) {
+    sizes.push_back(input->dimension(1));
+    row_size += sizes.back();
+  }
+
+  T* out = &(*output)(0, 0);
+  std::vector<const T*> inp;
+  inp.reserve(num_inputs);
+  for (const auto& input : inputs) {
+    inp.push_back(&(*input)(0, 0));
+  }
+  const int64 dim0 = output->dimension(0);
+  for (int64 i = 0; i < dim0; ++i) {
+    for (int64 j = 0; j < num_inputs; ++j) {
+      auto size = sizes[j];
+      d.memcpy(out, inp[j], size * sizeof(T));
+      out += size;
+      inp[j] += size;
+    }
+  }
+}
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index e6dae5fa7e..9628a7efa4 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -35,6 +35,9 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 #if GOOGLE_CUDA
 typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
@@ -134,6 +137,12 @@ class ConcatBaseOp : public OpKernel {
         return;
       }
 #endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+      if (std::is_same<Device, SYCLDevice>::value) {
+        ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
+        return;
+      }
+#endif // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -207,6 +216,39 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
 
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(type)                                  \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<type>("T")     \
+                              .HostMemory("concat_dim"),     \
+                          ConcatOp<SYCLDevice, type>)        \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<type>("T")     \
+                              .TypeConstraint<int32>("Tidx") \
+                              .HostMemory("axis"),           \
+                          ConcatV2Op<SYCLDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL);
+REGISTER_KERNEL_BUILDER(Name("Concat")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("concat_dim")
+                            .HostMemory("values")
+                            .HostMemory("output"),
+                        ConcatOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("ConcatV2")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tidx")
+                            .HostMemory("values")
+                            .HostMemory("axis")
+                            .HostMemory("output"),
+                        ConcatV2Op<CPUDevice, int32>);
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
 class ConcatOffsetOp : public OpKernel {
  public:
   explicit ConcatOffsetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -293,4 +335,12 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
                             .HostMemory("offset"),
                         ConcatOffsetOp);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("concat_dim")
+                            .HostMemory("shape")
+                            .HostMemory("offset"),
+                        ConcatOffsetOp);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 115a842d1c..d444ddec1d 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -57,7 +57,10 @@ REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
   REGISTER_KERNEL_BUILDER(                                             \
       Name("Const").Device(DEVICE_SYCL).TypeConstraint<TYPE>("dtype"), \
       ConstantOp);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+REGISTER_SYCL_KERNEL(bool);
+REGISTER_SYCL_KERNEL(int64);
 #undef REGISTER_SYCL_KERNEL
 #endif
 
@@ -112,6 +115,17 @@ REGISTER_KERNEL_BUILDER(Name("Const")
                         HostConstantOp);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("dtype"),
+                        HostConstantOp);
+#endif // TENSORFLOW_USE_SYCL
+
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
@@ -186,6 +200,7 @@ REGISTER_KERNEL(CPU, quint8);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL(SYCL, float)
+REGISTER_KERNEL(SYCL, double)
 REGISTER_KERNEL_BUILDER(Name("Fill")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
@@ -246,6 +261,7 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL(float, SYCL);
+REGISTER_KERNEL(bool, SYCL);
 REGISTER_KERNEL_BUILDER(Name("ZerosLike")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 1a73a3d0f8..6a79be5a95 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Enter")                   \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          EnterOp)
+
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("RefEnter")                \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          EnterOp)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_REF_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(string);
+REGISTER_SYCL_HOST_REF_KERNEL(string);
+REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
+
+#undef REGISTER_SYCL_HOST_KERNEL
+#undef REGISTER_SYCL_HOST_REF_KERNEL
 #endif
 
 // Special GPU kernels for int32 and string.
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 1d2d815027..65801da3c7 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::acos<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index a6bff78694..f6e9b59cf8 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -18,7 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
           int64);
-          
+
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                    \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -26,10 +26,19 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
                           .Device(DEVICE_SYCL)                        \
                           .TypeConstraint<TYPE>("T"),                 \
                           BinaryOp<SYCLDevice, functor::add<TYPE>>);
-  REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("Add")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::add<int32>>);
 #endif // TENSORFLOW_USE_SYCL
-          
+
 #if GOOGLE_CUDA
 REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
 
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index 92a22e90c4..c9ebfe759b 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::asin<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 825e85283f..72645b303f 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::atan<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index c5a4aaf831..c74e10576d 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index a758da5842..634c90adc6 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::cos<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index 74d8faedb5..1e2300832f 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -37,8 +37,18 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           BinaryOp<SYCLDevice, functor::div<TYPE>>);
 REGISTER_SYCL_KERNEL(float)
-REGISTER_SYCL_KERNEL(int32)
+REGISTER_SYCL_KERNEL(double)
 #undef REGISTER_SYCL_KERNEL
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Div")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::safe_div<int32>>);
 #endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
index 7bd44abd39..93ea768836 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Equal")
                         BinaryOp<CPUDevice, functor::equal_to<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double);
+
+REGISTER_KERNEL_BUILDER(Name("Equal")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::equal_to<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index f1c53ca272..5573c2bcc2 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -21,4 +21,7 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 129d754b82..59e32d7f6f 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::floor<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index 8a600f8f95..fa81ef0872 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -21,17 +21,6 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
 REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
           Eigen::half, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("FloorDiv")                            \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::floor_div_real<TYPE>>);
-REGISTER_SYCL_KERNEL(float)
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
           int64);
@@ -51,4 +40,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FloorDiv")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 4e641a8bb3..55f8a30461 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -31,4 +31,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FloorMod")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index 8c9691d1ea..6b5a806aa2 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -33,5 +33,19 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Greater")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::greater<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index a6083cb9cd..ac21528256 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -34,4 +34,15 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                         BinaryOp<CPUDevice, functor::greater_equal<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float);
+
+REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::greater_equal<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 59976141c7..0faeffa95c 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -27,6 +27,7 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 675cb95b95..df63006b3f 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index c394087ed8..e1cf7a8637 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 701007d637..a38f1024a9 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less<int32>>);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
 
+REGISTER_KERNEL_BUILDER(Name("Less")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::less<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97fd1ae919..3a2cc2ae0e 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float);
+
+REGISTER_KERNEL_BUILDER(Name("LessEqual")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::less_equal<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 71c4588b3d..5e74e778c7 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::log<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 03ea3a0a89..edb821318e 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index f93b5a8303..7311f25ec0 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -34,4 +34,19 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Maximum")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::maximum<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 36800975a8..99e5a76620 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                         BinaryOp<CPUDevice, functor::minimum<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float);
+
+REGISTER_KERNEL_BUILDER(Name("Minimum")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::minimum<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index e23fe6761d..5273522626 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -28,7 +28,15 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           BinaryOp<SYCLDevice, functor::mul<TYPE>>);
 REGISTER_SYCL_KERNEL(float)
+REGISTER_SYCL_KERNEL(double)
 #undef REGISTER_SYCL_KERNEL
+REGISTER_KERNEL_BUILDER(Name("Mul")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::mul<int32>>);
 #endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index 8eeba6ab14..f1780168e4 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -27,6 +27,7 @@ REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
                           .TypeConstraint<TYPE>("T"),                 \
                           BinaryOp<SYCLDevice, functor::pow<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index d858a077f5..8c0e21f9cf 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -36,6 +36,9 @@ REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
 REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
           double, int64);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
+#endif // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -43,4 +46,7 @@ REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
 REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index 7a4482dbb2..e192f89782 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -20,9 +20,9 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
           int32, int64);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
+REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double);
 namespace functor {
-DEFINE_UNARY1(round, float);
+DEFINE_UNARY2(round, float, double);
 }  // namespace functor
 #endif
 
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
index 7dc96d47a6..f23725f48e 100644
--- a/tensorflow/core/kernels/cwise_op_rsqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index add26b5ac8..709628da13 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -28,6 +28,10 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
+
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
  public:
@@ -169,12 +173,24 @@ REGISTER_SELECT_GPU(complex128);
 
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the SYCL implementations.
+#define REGISTER_SELECT_SYCL(type)                                  \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Select").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      SelectOp<SYCLDevice, type>);
+
+REGISTER_SELECT_SYCL(float);
+REGISTER_SELECT_SYCL(int32);
+#undef REGISTER_SELECT_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
 namespace functor {
 
 // CPU Specializations of Select functors.
-template <typename T>
-struct SelectFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+template <typename Device, typename T>
+struct SelectFunctorBase {
+  void operator()(const Device& d, typename TTypes<T>::Flat out,
                   typename TTypes<bool>::ConstFlat cond_flat,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
@@ -182,10 +198,18 @@ struct SelectFunctor<CPUDevice, T> {
   }
 };
 
-// CPU Specializations of Select functors with scalar
 template <typename T>
-struct SelectScalarFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+struct SelectFunctor<CPUDevice, T>
+        : SelectFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SelectFunctor<SYCLDevice, T>
+        : SelectFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
+template <typename Device, typename T>
+struct SelectScalarFunctorBase {
+  void operator()(const Device& d, typename TTypes<T>::Flat out,
                   TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
@@ -193,9 +217,19 @@ struct SelectScalarFunctor<CPUDevice, T> {
   }
 };
 
+// CPU Specializations of Select functors with scalar
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d,
+struct SelectScalarFunctor<CPUDevice, T>
+        : SelectScalarFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SelectScalarFunctor<SYCLDevice, T>
+        : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
+template <typename Device, typename T>
+struct BatchSelectFunctorBase {
+  void operator()(const Device& d,
                   typename TTypes<T>::Matrix output_flat_outer_dims,
                   TTypes<bool>::ConstVec cond_vec,
                   typename TTypes<T>::ConstMatrix then_flat_outer_dims,
@@ -220,6 +254,15 @@ struct BatchSelectFunctor<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct BatchSelectFunctor<CPUDevice, T>
+        : BatchSelectFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct BatchSelectFunctor<SYCLDevice, T>
+        : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace functor
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index cc1f9b8f03..a76a088ac8 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -23,6 +23,9 @@ REGISTER5(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, Eigen::half, double,
 REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
           double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
+#endif // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -30,5 +33,8 @@ REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
 REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
+#endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index 568906612a..dedd414db5 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -33,4 +33,17 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                         UnaryOp<CPUDevice, functor::sign<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float);
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sign")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        UnaryOp<CPUDevice, functor::sign<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index 8d0c0959f7..ab54c61b56 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::sin<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYC
 
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 710001517b..55acf648db 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -27,8 +27,9 @@ REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index f867f127a7..afcacfec1c 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -27,6 +27,7 @@ REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::square<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYC
 
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index ac49cad88f..9c850c9420 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::tan<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYC
 
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index ae2c473e20..1dbc13061b 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -28,6 +28,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
                           .TypeConstraint<TYPE>("T"),                 \
                           UnaryOp<SYCLDevice, functor::tanh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYC
 
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 671de380d3..77b330f589 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,6 +171,21 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
   }
 };
 
+
+#ifdef TENSORFLOW_USE_SYCL
+// Partial specialization of BinaryFunctor for SYCL devices
+typedef Eigen::SyclDevice SYCLDevice;
+template <typename Functor>
+struct SimpleBinaryFunctor<SYCLDevice, Functor> {
+  void operator()(const SYCLDevice& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in0,
+                  typename Functor::tin_type in1) {
+    out.device(d) = in0.binaryExpr(in1, typename Functor::func());
+  }
+};
+
+#endif // TENSORFLOW_USE_SYCL
+
 template <typename T>
 struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
 
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 6250928aca..92018ec871 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
   BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
 
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_UNARY(sycl, Floor, float, DT_FLOAT);
+#endif // TENSORFLOW_USE_SYCL
+
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_UNARY(sycl, Floor, double, DT_DOUBLE);
+#endif // TENSORFLOW_USE_SYCL
+
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
+#endif // GOOGLE_CUDA
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
+#endif // GOOGLE_CUDA
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
+#endif // GOOGLE_CUDA
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
+#if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
 
 // data func scalar.
 static Graph* BinaryScalar(int num, const string& func) {
@@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
       ->Arg(1048576);
 
 BM_BINARY_SCALAR(cpu, Less);
+#if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Less);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, Less);
+#endif // TENSORFLOW_USE_SYCL
+
 BM_BINARY_SCALAR(cpu, Add);
+#if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Add);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, Add);
+#endif // TENSORFLOW_USE_SYCL
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {
 
 using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
+#if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
+#if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
+#endif // GOOGLE_CUDA
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
 
 using Eigen::half;
+#if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
+#endif // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
+#if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
+#if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
+#endif // GOOGLE_CUDA
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
   BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
   BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
 BM_BCAST_ADD_ROW_ALL(cpu);
+#if GOOGLE_CUDA
 BM_BCAST_ADD_ROW_ALL(gpu);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_ROW_ALL(sycl);
+#endif // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_ROW_ALL
 #undef BM_BCAST_ADD_ROW
 
@@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
   BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
   BM_BCAST_ADD_COL(DEVICE, 4096, 512);
 BM_BCAST_ADD_COL_ALL(cpu);
+#if GOOGLE_CUDA
 BM_BCAST_ADD_COL_ALL(gpu);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_COL_ALL(sycl);
+#endif // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index e9ddd535ec..55a7657ea8 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -97,6 +97,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
                               .TypeConstraint<type>("T"), \
                           DebugNanCountOp<type>);
 REGISTER_GPU_DEBUG_NAN_COUNT(float);
+REGISTER_GPU_DEBUG_NAN_COUNT(double);
 #endif // TENSORFLOW_USE_SYCL
 
 // Register debug numeric summary ops.
@@ -133,6 +134,7 @@ TF_CALL_double(REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT);
                               .TypeConstraint<type>("T"), \
                           DebugNumericSummaryOp<type>);
 REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float);
+REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double);
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 42fe6e88c9..767f143727 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -152,6 +152,7 @@ typedef Eigen::SyclDevice SYCLDevice;
       DenseUpdateOp<SYCLDevice, type, DenseUpdateType::SUB>);
 
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif
 
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 08ec4baff3..0df8f9d3ed 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -62,6 +62,8 @@ void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
 #define DEFINE_SETZERO_SYCL(T) \
   template struct SetZeroFunctor<Eigen::SyclDevice, T>;
 DEFINE_SETZERO_SYCL(float);
+DEFINE_SETZERO_SYCL(bool);
+DEFINE_SETZERO_SYCL(double);
 #undef DEFINE_SETZERO_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 9aa289c3c9..d08dec46d1 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -185,6 +185,34 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
                             .TypeConstraint<int32>("T"),
                         PassOn);
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+      PassOn);                                                           \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+      PassOn);
+
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(Name("_ListToArray")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        PassOn);
+REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        PassOn);
+#endif // TENSORFLOW_USE_SYCL
+
 class SymbolicGradientOp : public AsyncOpKernel {
  public:
   SymbolicGradientOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index a2b0127fac..94fe22ed31 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -46,6 +46,9 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T, bool USE_CUBLAS>
 struct LaunchMatMul;
@@ -118,27 +121,42 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
   return false;
 }
 
-// On CPUs, we ignore USE_CUBLAS
-template <typename T>
-struct LaunchMatMulCPU {
+template <typename Device, typename T>
+struct LaunchMatMulBase {
   static void launch(
       OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
       Tensor* out) {
+#ifndef TENSORFLOW_USE_SYCL
     // An explicit vector-matrix multiply is much better optimized than an
     // implicit one and this is a bottleneck during non-batched inference.
     bool was_vector = ExplicitVectorMatrixOptimization<T>(a, b, dim_pair, out);
     if (!was_vector) {
-      functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(),
+#endif // TENSORFLOW_USE_SYCL
+      functor::MatMulFunctor<Device, T>()(ctx->eigen_device<Device>(),
                                              out->matrix<T>(), a.matrix<T>(),
                                              b.matrix<T>(), dim_pair);
+#ifndef TENSORFLOW_USE_SYCL
     }
+#endif // TENSORFLOW_USE_SYCL
   }
 };
+// On CPUs, we ignore USE_CUBLAS
+template <typename T>
+struct LaunchMatMulCPU : LaunchMatMulBase<CPUDevice, T> {};
+
 
 template <typename T, bool USE_CUBLAS>
 struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {};
 
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct LaunchMatMulSYCL : LaunchMatMulBase<SYCLDevice, T> {};
+
+template <typename T, bool USE_CUBLAS>
+struct LaunchMatMul<SYCLDevice, T, USE_CUBLAS> : public LaunchMatMulSYCL<T> {};
+#endif // TENSORFLOW_USE_SYCL
+
 #if GOOGLE_CUDA
 
 template <typename T>
@@ -256,6 +274,20 @@ struct MatMulFunctor<CPUDevice, T> {
   }
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+// Partial specialization MatMulFunctor<Device=SYCLDevice, T>.
+template <typename T>
+struct MatMulFunctor<SYCLDevice, T> {
+  void operator()(
+      const SYCLDevice& d, typename MatMulTypes<T>::out_type out,
+      typename MatMulTypes<T>::in_type in0,
+      typename MatMulTypes<T>::in_type in1,
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+    MatMul<SYCLDevice>(d, out, in0, in1, dim_pair);
+  }
+};
+#endif // TENSORFLOW_USE_SYCL
+
 }  // end namespace functor
 
 #define REGISTER_CPU(T)                                                        \
@@ -276,6 +308,12 @@ struct MatMulFunctor<CPUDevice, T> {
                               .Label("cublas"),                    \
                           MatMulOp<GPUDevice, T, true /* cublas */>)
 
+#if defined (INTEL_MKL)
+// MKL does not support half and int32 types for matrix-multiplication, so
+// register the kernel to use default Eigen based implementations for these types
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_int32(REGISTER_CPU);
+#else
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 TF_CALL_half(REGISTER_CPU);
@@ -283,6 +321,7 @@ TF_CALL_half(REGISTER_CPU);
 TF_CALL_int32(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
 TF_CALL_complex128(REGISTER_CPU);
+#endif
 
 #if GOOGLE_CUDA
 TF_CALL_float(REGISTER_GPU);
@@ -294,4 +333,17 @@ TF_CALL_half(REGISTER_GPU);
 #endif
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("MatMul").Device(DEVICE_SYCL).TypeConstraint<T>("T"),    \
+      MatMulOp<SYCLDevice, T, false /* xxblas */>); \
+  REGISTER_KERNEL_BUILDER(Name("MatMul")                            \
+                              .Device(DEVICE_SYCL)                  \
+                              .TypeConstraint<T>("T")               \
+                              .Label("eigen"),                      \
+                          MatMulOp<SYCLDevice, T, false /* xxblas */>)
+TF_CALL_float(REGISTER_SYCL);
+
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
new file mode 100644
index 0000000000..3ba28c13ed
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -0,0 +1,217 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+// This file uses MKL CBLAS xGEMM for acceleration of TF Matrix-Matrix
+// Multiplication (MatMul) operations.
+// We currently register this kernel only for MKL supported data
+// types (float, double, complex64, complex128). The macro INTEL_MKL is defined
+// by the build system only when MKL is chosen as an option at configure stage
+// and when it is undefined at build time, this file becomes an empty
+// compilation unit
+
+#if defined(INTEL_MKL)
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "third_party/mkl/include/mkl_cblas.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T, bool USE_CUBLAS>
+class MklMatMulOp : public OpKernel {
+ public:
+  explicit MklMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& a = ctx->input(0);
+    const Tensor& b = ctx->input(1);
+
+    // Check that the dimensions of the two matrices are valid.
+    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+                errors::InvalidArgument("In[0] is not a matrix"));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+                errors::InvalidArgument("In[1] is not a matrix"));
+    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+    dim_pair[0].first = transpose_a_ ? 0 : 1;
+    dim_pair[0].second = transpose_b_ ? 1 : 0;
+
+    OP_REQUIRES(ctx,
+                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+                errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
+                                        a.shape().DebugString(), ", In[1]: ",
+                                        b.shape().DebugString()));
+    int a_dim_remaining = 1 - dim_pair[0].first;
+    int b_dim_remaining = 1 - dim_pair[0].second;
+    TensorShape out_shape(
+        {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+    if (out->NumElements() == 0) {
+      // If a has shape [0, x] or b has shape [x, 0], the output shape
+      // is a 0-element matrix, so there is nothing to do.
+      return;
+    }
+
+    if (a.NumElements() == 0 || b.NumElements() == 0) {
+      // If a has shape [x, 0] and b has shape [0, y], the
+      // output shape is [x, y] where x and y are non-zero, so we fill
+      // the output with zeros.
+      functor::SetZeroFunctor<Device, T> f;
+      f(ctx->eigen_device<Device>(), out->flat<T>());
+      return;
+    }
+
+    const int m = a.dim_size(1 - dim_pair[0].first);
+    const int k = a.dim_size(dim_pair[0].first);
+    const int n = b.dim_size(1 - dim_pair[0].second);
+    bool transpose_a = dim_pair[0].first == 0;
+    bool transpose_b = dim_pair[0].second == 1;
+
+    auto a_ptr = (a.template flat<T>().data());
+    auto b_ptr = (b.template flat<T>().data());
+    auto c_ptr = (out->template flat<T>().data());
+
+    MklBlasGemm(transpose_a, transpose_b, m, n, k, a_ptr, transpose_a ? m : k,
+                b_ptr, transpose_b ? k : n, c_ptr, n);
+  }
+
+ private:
+  bool transpose_a_;
+  bool transpose_b_;
+
+  // --------------------------------------------------------------------------
+  //
+  // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS
+  // interface. c = op(a) * op(b)
+  //
+  // @param transa  Specifies the form of op(a) used in MatMul. If transa is
+  // true, then op(a) = a^T, otherwise op(a) = a
+  //
+  // @param transb  Specifies the form of op(b) used in MatMul. If transb is
+  // true, then op(b) = b^T, otherwise op(b) = b
+  //
+  // @param m       Specifies the number of rows of the matrix op(a) and of the
+  // matrix c. The value of m must be at least zero.
+  //
+  // @param n       Specifies the number of columns of the matrix op(b) and the
+  // number of columns of the matrix c. The value of n must be at least zero.
+  //
+  // @param k       Specifies the number of columns of the matrix op(a) and the
+  // number of rows of the matrix op(b)
+  //
+  // @param a       Address of matrix a
+  //
+  // @param lda     Leading dimension of 'a' matrix. This is set at calling site
+  // depending on transa parameter. Since TF uses row-major
+  // layout, leading dimension is the stride between consecutive rows
+  // lda = max(1,k) when transa is false, otherwise lda = max(1,m)
+  //
+  // @param b       Address of matrix b
+  //
+  // @param ldb     Leading dimension of 'b' matrix. This is set at calling site
+  // depending on transb parameter. Since TF uses row-major
+  // layout, leading dimension is the stride between consecutive rows
+  // ldb = max(1,n) when transb is false, otherwise ldb = max(1,k)
+  //
+  // @param c       Address of matrix c
+  //
+  // @param ldc     Leading dimension of 'c' matrix. Since TF uses row-major
+  // layout, leading dimension is the stride between consecutive rows, max(1,n)
+  //
+  // --------------------------------------------------------------------------
+  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+                   const int k, const float* a, const int lda, const float* b,
+                   const int ldb, float* c, const int ldc) {
+    // BLAS GEMM API defines Matrix Multiplication as c = alpha * op(a) * op(b)
+    // + beta * c.
+    // Since TF MatMul does not have parameters for alpha, beta, we set them to
+    // 1.0 and 0.0 respectively.
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
+                ldb, beta, c, ldc);
+  }
+
+  // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about
+  // parameters, look at FP32 function description.
+  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+                   const int k, const double* a, const int lda, const double* b,
+                   const int ldb, double* c, const int ldc) {
+    const double alpha = 1.0;
+    const double beta = 0.0;
+    cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
+                ldb, beta, c, ldc);
+  }
+
+  // Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
+  // For detailed info about parameters, look at FP32 function description.
+  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+                   const int k, const std::complex<float>* a, const int lda,
+                   const std::complex<float>* b, const int ldb,
+                   std::complex<float>* c, int const ldc) {
+    const MKL_Complex8 alpha = {1.0f, 0.0f};
+    const MKL_Complex8 beta = {0.0f, 0.0f};
+    cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+                transb ? CblasTrans : CblasNoTrans, m, n, k,
+                static_cast<const void*>(&alpha), static_cast<const void*>(a),
+                lda, static_cast<const void*>(b), ldb,
+                static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
+  }
+
+  // Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
+  // tensors. For detailed info about parameters, look at FP32 function
+  // description.
+  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+                   const int k, const std::complex<double>* a, const int lda,
+                   const std::complex<double>* b, const int ldb,
+                   std::complex<double>* c, const int ldc) {
+    const MKL_Complex16 alpha = {1.0, 0.0};
+    const MKL_Complex16 beta = {0.0, 0.0};
+    cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+                transb ? CblasTrans : CblasNoTrans, m, n, k,
+                static_cast<const void*>(&alpha), static_cast<const void*>(a),
+                lda, static_cast<const void*>(b), ldb,
+                static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
+  }
+};
+
+#define REGISTER_CPU(T)                                                      \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),              \
+      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);       \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("MKL"), \
+      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+
+// TODO:Consider template specialization when adding/removing additional types
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+TF_CALL_complex64(REGISTER_CPU);
+TF_CALL_complex128(REGISTER_CPU);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 4977ad1d7c..a6650f369b 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -167,6 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("Pack")
       PackOp<SYCLDevice, type>)
 
 REGISTER_SYCL(float);
+REGISTER_SYCL(double);
 #undef REGISTER_SYCL
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index bec2d02cb5..91984319c6 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -38,6 +38,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class PadOp : public OpKernel {
@@ -199,4 +202,30 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
                         PadOp<CPUDevice, int32>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the GPU implementations.
+#define REGISTER_SYCL_KERNEL(T)                                   \
+  REGISTER_KERNEL_BUILDER(Name("Pad")                             \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<T>("T")             \
+                              .TypeConstraint<int32>("Tpaddings") \
+                              .HostMemory("paddings"),            \
+                          PadOp<SYCLDevice, T>)
+
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Pad")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tpaddings")
+                            .HostMemory("input")
+                            .HostMemory("paddings")
+                            .HostMemory("output"),
+                        PadOp<CPUDevice, int32>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 1bc7e14187..0d309c2185 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -268,6 +268,31 @@ struct ReduceFunctor<CPUDevice, Reducer>
 template <typename Reducer>
 struct ReduceFunctor<SYCLDevice, Reducer>
         : ReduceFunctorBase<SYCLDevice, Reducer>{};
+
+template <typename T>
+struct ReduceFunctor<SYCLDevice, Eigen::internal::MeanReducer<T> > {
+  template <typename OUT_T, typename IN_T, typename ReductionAxes>
+  static void Reduce(const SYCLDevice& d, OUT_T out, IN_T in,
+                     const ReductionAxes& reduction_axes,
+                     const Eigen::internal::MeanReducer<T>& reducer) {
+    typedef typename IN_T::Index Index;
+    // Eigen sum reductions are much faster on GPU than mean reductions:
+    // Simply trigger them by computing the sum of the weighted inputs.
+    Index num_coeffs_to_reduce = 1;
+    for (int i = 0; i < Eigen::internal::array_size<ReductionAxes>::value;
+         ++i) {
+      num_coeffs_to_reduce *= in.dimension(reduction_axes[i]);
+    }
+    T scale = T(1.0) / num_coeffs_to_reduce;
+    out.device(d) = (in * scale).sum(reduction_axes);
+  }
+
+  template <typename OUT_T>
+  static void FillIdentity(const SYCLDevice& d, OUT_T out,
+                           const Eigen::internal::MeanReducer<T>& reducer) {
+    FillIdentityEigenImpl(d, out, reducer);
+  }
+};
 #endif // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc
index db86157c8e..5ab97d1eee 100644
--- a/tensorflow/core/kernels/reduction_ops_max.cc
+++ b/tensorflow/core/kernels/reduction_ops_max.cc
@@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
 
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)         \
+  REGISTER_KERNEL_BUILDER(                  \
+      Name("Max")                           \
+          .Device(DEVICE_SYCL)              \
+          .TypeConstraint<type>("T")        \
+          .TypeConstraint<int32>("Tidx")    \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+    Name("Max")
+        .Device(DEVICE_SYCL)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int32>("Tidx"),
+    ReductionOp<CPUDevice, int32, Eigen::internal::MaxReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
index fef3cd0699..e018cb55dd 100644
--- a/tensorflow/core/kernels/reduction_ops_mean.cc
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -44,4 +44,17 @@ REGISTER_GPU_KERNELS(double);
 
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)         \
+  REGISTER_KERNEL_BUILDER(                  \
+      Name("Mean")                          \
+          .Device(DEVICE_SYCL)              \
+          .TypeConstraint<type>("T")        \
+          .TypeConstraint<int32>("Tidx")    \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, Eigen::internal::MeanReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index c362bc8867..ec240421b9 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
 
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)         \
+  REGISTER_KERNEL_BUILDER(                  \
+      Name("Min")                           \
+          .Device(DEVICE_SYCL)              \
+          .TypeConstraint<type>("T")        \
+          .TypeConstraint<int32>("Tidx")    \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, Eigen::internal::MinReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+    Name("Min")
+        .Device(DEVICE_SYCL)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int32>("Tidx"),
+    ReductionOp<CPUDevice, int32, Eigen::internal::MinReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc
index c6aff8c2ed..e04c655dab 100644
--- a/tensorflow/core/kernels/reduction_ops_prod.cc
+++ b/tensorflow/core/kernels/reduction_ops_prod.cc
@@ -45,4 +45,28 @@ REGISTER_GPU_KERNELS(double);
 
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)         \
+  REGISTER_KERNEL_BUILDER(                  \
+      Name("Prod")                          \
+          .Device(DEVICE_SYCL)              \
+          .TypeConstraint<type>("T")        \
+          .TypeConstraint<int32>("Tidx")    \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, Eigen::internal::ProdReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+    Name("Prod")
+        .Device(DEVICE_SYCL)
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int32>("Tidx")
+        .HostMemory("input")
+        .HostMemory("output")
+        .HostMemory("reduction_indices"),
+    ReductionOp<CPUDevice, int32, Eigen::internal::ProdReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 3aa38f418e..938ca66a0c 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -74,7 +74,6 @@ REGISTER_KERNEL_BUILDER(
           .HostMemory("reduction_indices"), \
       ReductionOp<SYCLDevice, type, Eigen::internal::SumReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
-REGISTER_SYCL_KERNELS(double);
 #undef REGISTER_SYCL_KERNELS
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index f24a71ec8c..d70398bea5 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -29,6 +29,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 #define REGISTER_RELU_KERNELS(type)                                   \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -131,4 +134,30 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the GPU implementations.
+#define REGISTER_SYCL_KERNELS(type)                                    \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
+      ReluOp<SYCLDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
+      ReluGradOp<SYCLDevice, type>);                                   \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
+      Relu6Op<SYCLDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      Relu6GradOp<SYCLDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),       \
+      EluOp<SYCLDevice, type>);                                        \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
+      EluGradOp<SYCLDevice, type>)
+
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index 365c6201a5..e2e0bd48dd 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -175,6 +175,10 @@ void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
 
 }  // namespace tensorflow
 
+#ifdef TENSORFLOW_USE_SYCL
+#undef EIGEN_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
 #undef EIGEN_USE_THREADS
 
 #endif  // TENSORFLOW_KERNELS_RELU_OP_H_
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 5df36ef4cd..c5c805c44f 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -130,7 +130,7 @@ class CachedInterpolation {
     }
     // We use 2 hands and walk through, copying from one to another where
     // we already have values.
-    // Invarient, new_indicies_hand <= cached_values_hand
+    // Invariant, new_indicies_hand <= cached_values_hand
     const std::array<int64, 4> new_x_indices{{x_0, x_1, x_2, x_3}};
     int cached_values_hand = 0;
     int new_indicies_hand = 0;
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index ec3b27ed6f..24b3ba31b8 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -33,6 +33,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -344,4 +347,36 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                         ReverseV2Op<CPUDevice, int32>);
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T)                             \
+  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<T>("T")        \
+                              .HostMemory("dims"),           \
+                          ReverseOp<SYCLDevice, T>)          \
+  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<T>("T")        \
+                              .TypeConstraint<int32>("Tidx") \
+                              .HostMemory("axis"),           \
+                          ReverseV2Op<SYCLDevice, T>)
+TF_CALL_float(REGISTER_SYCL_KERNELS);
+
+REGISTER_KERNEL_BUILDER(Name("Reverse")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("tensor")
+                            .HostMemory("dims")
+                            .HostMemory("output"),
+                        ReverseOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("ReverseV2")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tidx")
+                            .HostMemory("tensor")
+                            .HostMemory("axis")
+                            .HostMemory("output"),
+                        ReverseV2Op<CPUDevice, int32>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 827eb7dbca..51dad49cfe 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -180,8 +180,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 
 #define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL);
 
-REGISTER_SCATTER_ARITHEMTIC_SYCL(float);
-REGISTER_SCATTER_UPDATE_SYCL(float);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL);
 
 #undef REGISTER_SCATTER_ARITHEMTIC_SYCL
 #undef REGISTER_SCATTER_UPDATE_SYCL
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index c24ecdf8b9..c8ea923020 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -92,9 +92,11 @@ class RangeOp : public OpKernel {
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
 TF_CALL_float(REGISTER_SYCL_KERNEL);
+TF_CALL_double(REGISTER_SYCL_KERNEL);
 TF_CALL_int32(REGISTER_SYCL_KERNEL);
 TF_CALL_int64(REGISTER_SYCL_KERNEL);
-#endif  // TENSORFLOW_USE_SYCL
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
 
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
@@ -170,4 +172,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
+TF_CALL_float(REGISTER_SYCL_KERNEL);
+TF_CALL_double(REGISTER_SYCL_KERNEL);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 6bc0b4560b..177a32464b 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -201,6 +201,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
                               .HostMemory("output"),     \
                           RankOp);
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 
 // A special GPU kernel for int32 and bool.
@@ -297,6 +298,43 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                         SizeOp<int64>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                               \
+  REGISTER_KERNEL_BUILDER(Name("Size")                           \
+                              .Device(DEVICE_SYCL)               \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("out_type") \
+                              .HostMemory("output"),             \
+                          SizeOp<int32>);                        \
+  REGISTER_KERNEL_BUILDER(Name("Size")                           \
+                              .Device(DEVICE_SYCL)               \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("out_type") \
+                              .HostMemory("output"),             \
+                          SizeOp<int64>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Size")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("out_type")
+                            .HostMemory("input")
+                            .HostMemory("output"),
+                        SizeOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("Size")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("out_type")
+                            .HostMemory("input")
+                            .HostMemory("output"),
+                        SizeOp<int64>);
+#endif // TENSORFLOW_USE_SYCL
+
 // ExpandDims ------------------------------------
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .Device(DEVICE_CPU)
@@ -323,7 +361,30 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("dim")
                             .HostMemory("output"),
                         ExpandDimsOp);
-#endif
+#endif // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("ExpandDims")                 \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<type>("T")     \
+                              .TypeConstraint<int32>("Tdim") \
+                              .HostMemory("dim"),            \
+                          ExpandDimsOp);
+REGISTER_SYCL_KERNEL(float)
+REGISTER_SYCL_KERNEL(double)
+
+#undef REGISTER_SYCL_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tdim")
+                            .HostMemory("input")
+                            .HostMemory("dim")
+                            .HostMemory("output"),
+                        ExpandDimsOp);
+#endif // TENSORFLOW_USE_SYCL
 
 // Squeeze ---------------------------------------
 REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
@@ -347,4 +408,24 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze")
                         SqueezeOp);
 #endif
 
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                  \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+      SqueezeOp);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Squeeze")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("input")
+                            .HostMemory("output"),
+                        SqueezeOp);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index c7ae93852f..de11de32f1 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -28,17 +28,27 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from SoftmaxEigenImpl.
 namespace functor {
-template <typename T>
-struct SoftmaxFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+template <typename Device, typename T>
+struct SoftmaxFunctorBase {
+  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::Matrix softmax, const bool log) {
-    SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax, log);
+    SoftmaxEigenImpl<Device, T>::Compute(d, logits, softmax, log);
   }
 };
+template <typename T>
+struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                          \
@@ -76,4 +86,10 @@ REGISTER_KERNEL_BUILDER(
     SoftmaxOp<GPUDevice, float>);
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
+    SoftmaxOp<SYCLDevice, float>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index c18b992ea1..161ba89212 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -99,6 +99,9 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp);
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
+#endif // TENSORFLOW_USE_SYCL
 
 class UnstageOp : public OpKernel {
  public:
@@ -126,5 +129,8 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
+#endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 6580d271d1..10593516f7 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -451,4 +451,71 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(type)                                    \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
+                              .Device(DEVICE_SYCL)             \
+                              .TypeConstraint<type>("T")       \
+                              .HostMemory("begin")             \
+                              .HostMemory("end")               \
+                              .HostMemory("strides")           \
+                              .TypeConstraint<int32>("Index"), \
+                          StridedSliceOp<SYCLDevice, type>)    \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
+                              .Device(DEVICE_SYCL)             \
+                              .TypeConstraint<type>("T")       \
+                              .HostMemory("shape")             \
+                              .HostMemory("begin")             \
+                              .HostMemory("end")               \
+                              .HostMemory("strides")           \
+                              .TypeConstraint<int32>("Index"), \
+                          StridedSliceGradOp<SYCLDevice, type>)\
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
+                              .Device(DEVICE_SYCL)             \
+                              .TypeConstraint<type>("T")       \
+                              .HostMemory("begin")             \
+                              .HostMemory("end")               \
+                              .HostMemory("strides")           \
+                              .TypeConstraint<int32>("Index"), \
+                          StridedSliceAssignOp<SYCLDevice, type>)
+
+REGISTER_SYCL(float);
+REGISTER_SYCL(double);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("StridedSlice")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("input")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides")
+                            .HostMemory("output"),
+                        StridedSliceOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("shape")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides")
+                            .HostMemory("dy")
+                            .HostMemory("output"),
+                        StridedSliceGradOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("ref")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides"),
+                        StridedSliceAssignOp<CPUDevice, int32>)
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 6341964b4b..d820db348e 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -287,6 +287,20 @@ DECLARE_FOR_N_GPU(int32);
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
 DECLARE_FOR_N_CPU(bfloat16);
 
+#ifdef TENSORFLOW_USE_SYCL
+#define PREVENT_FOR_N_SYCL(T) \
+  PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM)
+
+#define DECLARE_FOR_N_SYCL(T) \
+  INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM)
+
+TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL);
+DECLARE_FOR_N_SYCL(int32);
+
+#undef DECLARE_FOR_N_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
 #undef INSTANTIATE
 #undef DECLARE_FOR_N_CPU
 #undef DECLARE_FOR_N_GPU
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index e55c8679e9..9822b021eb 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -260,6 +260,8 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
 #endif // TENSORFLOW_USE_SYCL
 
 #undef HANDLE_TYPE_NAME_CPU
@@ -506,6 +508,16 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
 TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
 #endif  // GOOGLE_CUDA
 
+#if TENSORFLOW_USE_SYCL
+#define HANDLE_TYPE_NAME_SYCL(T) \
+  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);
+
+TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
+#undef HANDLE_TYPE_NAME_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
 #undef HANDLE_TYPE_NAME_CPU
 #undef HANDLE_TYPE_NAME_GPU
 #undef HANDLE_CASE_DIM
@@ -605,6 +617,25 @@ REGISTER_KERNEL_BUILDER(Name("Tile")
                             .TypeConstraint<int32>("Tmultiples")
                             .HostMemory("multiples"),
                         TileOp<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<double>("T")
+                            .TypeConstraint<int32>("Tmultiples")
+                            .HostMemory("multiples"),
+                        TileOp<SYCLDevice>);
+
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<float>("T")
+                            .TypeConstraint<int32>("Tmultiples")
+                            .HostMemory("multiples"),
+                        TileGradientOp<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<double>("T")
+                            .TypeConstraint<int32>("Tmultiples")
+                            .HostMemory("multiples"),
+                        TileGradientOp<SYCLDevice>);
 #endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index 650c739ed5..f06cc5514c 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -70,6 +70,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
 
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
@@ -81,6 +83,8 @@ TF_CALL_float(DEFINE_TYPE);
 #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
 
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 336c6b0ccc..5c2d371430 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -423,6 +423,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
 TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
 #endif
 
@@ -2355,6 +2356,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
 
 TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
 #endif
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 30b82f1843..3681b9a129 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -127,6 +127,7 @@ Status DoTranspose<SYCLDevice>(const SYCLDevice& d, const Tensor& in,
   switch (in.dtype()) {
 
     case DT_FLOAT:
+    case DT_DOUBLE:
     case DT_INT32:
       internal::Transpose<SYCLDevice, uint32>(d, in, perm, out);
       break;
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 67300c1e96..4d303f0173 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -82,6 +82,15 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                             .HostMemory("y"),
                         InvertPermutationOp);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("x")
+                            .HostMemory("y"),
+                        InvertPermutationOp);
+#endif // TENSORFLOW_USE_SYCL
+
 // output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
 // of type T and rank N, and a permutation of 0, 1, ..., N-1. It
 // shuffles the dimensions of the input tensor according to permutation.
@@ -201,4 +210,24 @@ TF_CALL_POD_TYPES(REGISTER);
 #undef REGISTER
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                                   gtl::ArraySlice<int32> perm, Tensor* out) {
+  typedef Eigen::SyclDevice SYCLDevice;
+  return ::tensorflow::DoTranspose(ctx->eigen_device<SYCLDevice>(), in, perm,
+                                   out);
+}
+#define REGISTER(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
+                              .Device(DEVICE_SYCL)            \
+                              .TypeConstraint<T>("T")         \
+                              .TypeConstraint<int32>("Tperm") \
+                              .HostMemory("perm"),            \
+                          TransposeSyclOp);
+REGISTER(float);
+REGISTER(bool);
+REGISTER(int32);
+#undef REGISTER
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index 3b209c0ccc..5f40bcecc1 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -50,6 +50,17 @@ class TransposeGpuOp : public TransposeOp {
                      gtl::ArraySlice<int32> perm, Tensor* out) override;
 };
 
+#ifdef TENSORFLOW_USE_SYCL
+class TransposeSyclOp : public TransposeOp {
+ public:
+  explicit TransposeSyclOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
+
+ protected:
+  Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+};
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 2a14fa3265..e4c79ae17b 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -160,6 +160,7 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
       UnpackOp<SYCLDevice, type>)
 
 REGISTER_SYCL(float);
+REGISTER_SYCL(double);
 #undef REGISTER_SYCL
 
 // A special SYCL kernel for int32.
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 34e227156d..7a4d9dc650 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -58,8 +58,9 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
                           IsVariableInitializedOp);
 
 REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif
+#endif // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 // Only register 'Variable' on GPU for the subset of types also supported by
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 26f4fb2a2e..dc21cee3a8 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -28,6 +28,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -74,17 +77,25 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from XentEigenImpl.
 namespace functor {
-template <typename T>
-struct XentFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+template <typename Device, typename T>
+struct XentFunctorBase {
+  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+    XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
                                          backprop);
   }
 };
+
+template <typename T>
+struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                         \
@@ -111,4 +122,11 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                         SoftmaxXentWithLogitsOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<float>("T"),
+                        SoftmaxXentWithLogitsOp<SYCLDevice, float>);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 2def59ff04..8670ca307c 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -390,7 +390,7 @@ class TestOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_CPU), TestOp);
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_SYCL), TestOp);
-#endif  // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
 
 TEST_F(MathGradTest, Error_Reporting) {
   auto x = test::AsTensor<float>({-3.f});
@@ -707,6 +707,8 @@ TEST_F(MathGradTest, Pow) {
   }
 }
 
+//TODO{lukeiwanski}: Implement Complex Pow for SYCL
+#ifndef TENSORFLOW_USE_SYCL
 TEST_F(MathGradTest, ComplexPow) {
   auto x = test::AsTensor<complex64>({0.f, 2.f, -2.f}, TensorShape({3}));
   auto y = test::AsTensor<complex64>({2.f, 2.f, 2.f}, TensorShape({3}));
@@ -725,6 +727,7 @@ TEST_F(MathGradTest, ComplexPow) {
       dy, test::AsTensor<complex64>({h(0.f, 2.f), h(2.f, 2.f), h(-2.f, 2.f)},
                                     TensorShape({3})));
 }
+#endif // TENSORFLOW_USE_SYCL
 
 TEST_F(MathGradTest, Maximum) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
@@ -886,6 +889,8 @@ TEST_F(MathGradTest, MatMul_11) {
   test::ExpectClose(dy, MatMul(dz, true, x, true));
 }
 
+//TODO{lukeiwanski}: Implement BatchMatMul for SYCL
+#ifndef TENSORFLOW_USE_SYCL
 TEST_F(MathGradTest, BatchMatMul_00) {
   auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
                                  TensorShape({1, 2, 3}));
@@ -933,6 +938,7 @@ TEST_F(MathGradTest, BatchMatMul_11) {
   test::ExpectClose(dx, BatchMatMul(y, true, dz, true));
   test::ExpectClose(dy, BatchMatMul(dz, true, x, true));
 }
+#endif // TENSORFLOW_USE_SYCL
 
 TEST_F(MathGradTest, Sum_dim0) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index c9d648f8b9..9180170e20 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25367,6 +25367,59 @@ op {
   description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
 op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "A tensor whose shape is a prefix of `data.shape`."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes the max along segments of a tensor."
+  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n  range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+}
+op {
   name: "Unstage"
   output_arg {
     name: "values"
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index cd0a9436d9..81d49684a8 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #define TF_MAJOR_VERSION 1
 #define TF_MINOR_VERSION 0
-#define TF_PATCH_VERSION 0-rc2
+#define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
diff --git a/tensorflow/docs_src/programmers_guide/meta_graph.md b/tensorflow/docs_src/programmers_guide/meta_graph.md
index add4379d7d..fa4cee8700 100644
--- a/tensorflow/docs_src/programmers_guide/meta_graph.md
+++ b/tensorflow/docs_src/programmers_guide/meta_graph.md
@@ -277,7 +277,7 @@ Here are some of the typical usage models:
 * Retrieve Hyper Parameters
 
   ```Python
-  filename = ".".join([tf.latest_checkpoint(train_dir), "meta"])
+  filename = ".".join([tf.train.latest_checkpoint(train_dir), "meta"])
   tf.train.import_meta_graph(filename)
   hparams = tf.get_collection("hparams")
   ```
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index e612eb7424..575dea5584 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -893,7 +893,8 @@ def main(_):
     print('=== MISCLASSIFIED TEST IMAGES ===')
     for i, test_filename in enumerate(test_filenames):
       if predictions[i] != test_ground_truth[i].argmax():
-        print('%70s  %s' % (test_filename, image_lists.keys()[predictions[i]]))
+        print('%70s  %s' % (test_filename, 
+                            list(image_lists.keys())[predictions[i]]))
 
   # Write out the trained graph and labels with the weights stored as constants.
   output_graph_def = graph_util.convert_variables_to_constants(
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 6e5fe7891b..15cf4b91dd 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -46,13 +46,13 @@ def conv_model(feature, target, mode):
 
   # First conv layer will compute 32 features for each 5x5 patch
   with tf.variable_scope('conv_layer1'):
-    h_conv1 = layers.convolution(
+    h_conv1 = layers.convolution2d(
         feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
     h_pool1 = max_pool_2x2(h_conv1)
 
   # Second conv layer will compute 64 features for each 5x5 patch.
   with tf.variable_scope('conv_layer2'):
-    h_conv2 = layers.convolution(
+    h_conv2 = layers.convolution2d(
         h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
     h_pool2 = max_pool_2x2(h_conv2)
     # reshape tensor into a batch of vectors
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index a3a5f9e3e9..c3d00a11b9 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -104,8 +104,13 @@ def main(unused_argv):
 
   # Process vocabulary
   vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
-  x_train = np.array(list(vocab_processor.fit_transform(x_train)))
-  x_test = np.array(list(vocab_processor.transform(x_test)))
+  
+  x_transform_train = vocab_processor.fit_transform(x_train)
+  x_transform_test = vocab_processor.transform(x_test)
+  
+  x_train = np.array(list(x_transform_train))
+  x_test = np.array(list(x_transform_test))
+  
   n_words = len(vocab_processor.vocabulary_)
   print('Total words: %d' % n_words)
 
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 8dcd3bf37a..25800c109e 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -181,7 +181,7 @@ with graph.as_default():
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
   # Add variable initializer.
-  init = tf.global_variables_initializer()
+  init = tf.initialize_all_variables()
 
 # Step 5: Begin training.
 num_steps = 100001
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index 143a75a3e9..6faad294c2 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -6,7 +6,7 @@ Course information can be found at https://www.udacity.com/course/deep-learning-
 Running the Docker container from the Google Cloud repository
 -------------------------------------------------------------
 
-    docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:0.6.0
+    docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:1.0.0
 
 Note that if you ever exit the container, you can return to it using:
 
@@ -94,10 +94,10 @@ This will allow you to save work and have access to generated files on the host
 Pushing a Google Cloud release
 ------------------------------
 
-    V=0.6.0
+    V=1.0.0
     docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:$V
     gcloud docker push gcr.io/tensorflow/udacity-assignments
-    docker tag -f $USER/assignments gcr.io/tensorflow/udacity-assignments:latest
+    docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:latest
     gcloud docker push gcr.io/tensorflow/udacity-assignments
 
 History
@@ -109,3 +109,4 @@ History
 * 0.4.0: Move notMMNIST data for Google Cloud.
 * 0.5.0: Actually use 0.7.1 release.
 * 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow).
+* 1.0.0: Update to TF 1.0.0 release.
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index 561ce09099..7bba10efac 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,7 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
-      self.assertTrue("GPU" in [d.device_type for d in devices])
+      self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f2fd687adf..3ea7e547ee 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -44,7 +44,14 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util.protobuf import compare
+from tensorflow.python.client import device_lib
 
+def gpu_device_name():
+  """Returns the name of a GPU device if available or the empty string."""
+  for x in device_lib.list_local_devices():
+    if x.device_type == 'GPU' or x.device_type == 'SYCL':
+      return x.name
+  return ''
 
 def assert_ops_in_graph(expected_ops, graph):
   """Assert all expected operations are found.
@@ -301,7 +308,12 @@ class TensorFlowTestCase(googletest.TestCase):
       sess = self._cached_session
       with sess.graph.as_default(), sess.as_default():
         if force_gpu:
-          with sess.graph.device("/gpu:0"):
+          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # otherwise
+          gpu_name = gpu_device_name()
+          if len(gpu_name) == 0:
+            gpu_name = '/gpu:0'
+          with sess.graph.device(gpu_name):
             yield sess
         elif use_gpu:
           yield sess
@@ -311,7 +323,12 @@ class TensorFlowTestCase(googletest.TestCase):
     else:
       with session.Session(graph=graph, config=prepare_config(config)) as sess:
         if force_gpu:
-          with sess.graph.device("/gpu:0"):
+          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # otherwise
+          gpu_name = gpu_device_name()
+          if len(gpu_name) == 0:
+            gpu_name = '/gpu:0'
+          with sess.graph.device(gpu_name):
             yield sess
         elif use_gpu:
           yield sess
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index ae26dcb5a9..81eee48d2e 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -46,7 +46,7 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         v = 2. * (array_ops.zeros([128, 128]) + x)
-      with ops.device('/gpu:0'):
+      with ops.device(test.gpu_device_name()):
         stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32])
         stage = stager.put([x, v])
         z, y = stager.get()
@@ -62,7 +62,7 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         v = 2. * (array_ops.zeros([128, 128]) + x)
-      with ops.device('/gpu:0'):
+      with ops.device(test.gpu_device_name()):
         stager = data_flow_ops.StagingArea(
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 11b350a99e..48be8cff4f 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -46,11 +46,13 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual("Variable:0", var0.name)
       self.assertEqual([], var0.get_shape())
       self.assertEqual([], var0.get_shape())
+      self.assertEqual([], var0.shape)
 
       var1 = variables.Variable(1.1)
       self.assertEqual("Variable_1:0", var1.name)
       self.assertEqual([], var1.get_shape())
       self.assertEqual([], var1.get_shape())
+      self.assertEqual([], var1.shape)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         var0.eval()
@@ -69,11 +71,13 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual("rnd:0", rnd.name)
       self.assertEqual([3, 6], rnd.get_shape())
       self.assertEqual([3, 6], rnd.get_shape())
+      self.assertEqual([3, 6], rnd.shape)
 
       dep = variables.Variable(rnd.initialized_value(), name="dep")
       self.assertEqual("dep:0", dep.name)
       self.assertEqual([3, 6], dep.get_shape())
       self.assertEqual([3, 6], dep.get_shape())
+      self.assertEqual([3, 6], dep.shape)
 
       # Currently have to set the shape manually for Add.
       added_val = rnd.initialized_value() + dep.initialized_value() + 2.0
@@ -83,6 +87,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual("depdep:0", depdep.name)
       self.assertEqual([3, 6], depdep.get_shape())
       self.assertEqual([3, 6], depdep.get_shape())
+      self.assertEqual([3, 6], depdep.shape)
 
       variables.global_variables_initializer().run()
 
@@ -375,6 +380,7 @@ class VariablesTestCase(test.TestCase):
 
       v1 = variables.Variable(initializer, dtype=dtypes.float32)
       self.assertEqual(shape, v1.get_shape())
+      self.assertEqual(shape, v1.shape)
       self.assertAllClose(value, v1.initial_value.eval())
       with self.assertRaises(errors_impl.FailedPreconditionError):
         v1.eval()
@@ -382,6 +388,7 @@ class VariablesTestCase(test.TestCase):
       v2 = variables.Variable(
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
       self.assertEqual(v1.get_shape(), v2.get_shape())
+      self.assertEqual(v1.shape, v2.shape)
       self.assertAllClose(np.negative(value), v2.initial_value.eval())
 
       # Once v2.initial_value.eval() has been called, v1 has effectively been
@@ -532,6 +539,7 @@ class PartitionedVariableTest(test.TestCase):
       self.assertEqual(2, num_partitions)
       self.assertEqual([v0, v1], iterated_partitions)
       self.assertEqual([2], concatenated.get_shape())
+      self.assertEqual([2], concatenated.shape)
 
   def testPartitionedVariableFailures(self):
     with ops.Graph().as_default():
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index d037ceac61..4b3dadc112 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -157,7 +157,7 @@ class XentTest(test.TestCase):
         np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float64))
 
   def testGradient(self):
-    with self.test_session():
+    with self.test_session() as sess:
       l = constant_op.constant(
           [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
           shape=[3, 4],
@@ -171,14 +171,21 @@ class XentTest(test.TestCase):
       x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
                                                    name="xent")
       err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
+
+      # Check that no extra computation performed. When only first derivative is requested,
+      # second derivative must not be computed. So when there is no second derivative,
+      # there is no `BatchMatMul` op in the graph.
+      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
+      self.assertNotIn('BatchMatMul', op_names)
+
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
   def testSecondGradient(self):
-    with self.test_session():
-      l = constant_op.constant([0.0, 0.0, 1.0, 0.0,
-                                1.0, 0.0, 0.0, 0.0,
-                                0.0, 0.5, 0.0, 0.5], shape=[12],
+    with self.test_session() as sess:
+      l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
+                                1.0/3, 0.0, 0.0, 0.0,
+                                0.0, 0.5/3, 0.0, 0.5/3], shape=[12],
                                dtype=dtypes.float64, name="l")
       f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
                                 0.1, 0.4, 0.9, 1.6,
@@ -186,13 +193,19 @@ class XentTest(test.TestCase):
                                dtype=dtypes.float64, name="f")
       x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
                                                    name="xent")
-      loss = math_ops.reduce_mean(x)
+      loss = math_ops.reduce_sum(x)
 
-    # Taking ths second gradient should fail, since it is not
-    # yet supported.
-    with self.assertRaisesRegexp(LookupError,
-                                 "explicitly disabled"):
-      _ = gradients_impl.hessians(loss, [f])
+      gradients = gradients_impl.gradients(loss, [f])[0]
+
+      err = gradient_checker.compute_gradient_error(f, [12], gradients, [12])
+
+      # Check that second derivative is calculated.
+      # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad)
+      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
+      self.assertIn('BatchMatMul', op_names)
+
+    print("cross entropy hessian err = ", err)
+    self.assertLess(err, 5e-8)
 
   def testWrapper(self):
     features = np.array(
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index c64813962a..34380951af 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -586,7 +586,7 @@ def strided_slice(input_,
   `foo[::-1]` reverses a tensor with shape 8.
 
 
-  If the ith bit of `ellipsis_mask`, as many unspecified dimensions
+  If the ith bit of `ellipsis_mask` is non-zero, as many unspecified dimensions
   as needed will be inserted between other dimensions. Only one
   non-zero bit is allowed in `ellipsis_mask`.
 
@@ -594,7 +594,7 @@ def strided_slice(input_,
   equivalent to `foo[3:5,:,:,4:5]` and
   `foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`.
 
-  If the ith bit of `new_axis_mask` is one, then a `begin`,
+  If the ith bit of `new_axis_mask` is one, then `begin`,
   `end`, and `stride` are ignored and a new length 1 dimension is
   added at this point in the output tensor.
 
@@ -1545,7 +1545,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
       x: (indices, values, shape)}))  # Will succeed.
 
     sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape)
-    sp_value = sp.eval(session)
+    sp_value = sp.eval(session=sess)
     print(sess.run(y, feed_dict={x: sp_value}))  # Will succeed.
   ```
 
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index bda7212c8a..3dc0ac34c8 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -41,8 +41,10 @@ def clip_by_value(t, clip_value_min, clip_value_max,
 
   Args:
     t: A `Tensor`.
-    clip_value_min: A 0-D (scalar) `Tensor`. The minimum value to clip by.
-    clip_value_max: A 0-D (scalar) `Tensor`. The maximum value to clip by.
+    clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+      as `t`. The minimum value to clip by.
+    clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+      as `t`. The maximum value to clip by.
     name: A name for the operation (optional).
 
   Returns:
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 397c522dbe..41e8877d72 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -321,21 +321,25 @@ def _BroadcastMul(vec, mat):
 
 
 @ops.RegisterGradient("SoftmaxCrossEntropyWithLogits")
-def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
+def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
   """Gradient function for SoftmaxCrossEntropyWithLogits."""
-  # grad_0 is the backprop for cost, and we multiply it with the gradients
+  # grad_loss is the backprop for cost, and we multiply it with the gradients
   # (which is output[1])
+  # grad_grad is the backprop for softmax gradient.
   # There is no gradient for the labels
   #
-  # Currently there is no way to take the second derivative of this op
-  # due to the fused implementation's interaction with tf.gradients(),
-  # so we make sure we prevent silently incorrect results by raising
-  # an error if the second derivative is requested via prevent_gradient.
-  softmax_grad_without_gradient = array_ops.prevent_gradient(
-      op.outputs[1], message="Currently there is no way to take the second "
-      "derivative of softmax_cross_entropy_with_logits due to the fused "
-      " implementation's interaction with tf.gradients()")
-  return _BroadcastMul(grad_0, softmax_grad_without_gradient), None
+  # Second derivative is just softmax derivative w.r.t. logits.
+  softmax_grad = op.outputs[1]
+  grad = _BroadcastMul(grad_loss, softmax_grad)
+
+  if grad_grad.op.type not in ('ZerosLike', 'Zeros'):
+    logits = op.inputs[0]
+    softmax = nn_ops.softmax(logits)
+
+    grad += ((grad_grad - array_ops.squeeze(math_ops.matmul(grad_grad[:, None, :],
+                                                              softmax[:, :, None]), axis=1)) * softmax)
+
+  return grad, None
 
 
 @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 06ae3589a2..658c9f79ee 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -515,7 +515,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
       state = initial_state
     else:
       if not dtype:
-        raise ValueError("If no initial_state is provided, dtype must be.")
+        raise ValueError("If there is no initial_state, you must give a dtype.")
       state = cell.zero_state(batch_size, dtype)
 
     def _assert_has_shape(x, shape):
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index bf4d198209..5a8eb432d1 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -318,44 +318,28 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
     # into a single axis, and combine multiple summed axes into a
     # single axis.
 
-    t0_shape = tuple(x.value for x in t0.get_shape())
+    t0_shape = _get_shape(t0)
     num_broadcast_elements_t0 = _total_size(
         t0_shape[len(preserved_axes):-len(axes_to_sum)])
     num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
-    new_shape = t0_shape[:len(preserved_axes)] + (num_broadcast_elements_t0,
-                                                  num_summed_elements)
+    new_shape = (t0_shape[:len(preserved_axes)]
+                 + [num_broadcast_elements_t0, num_summed_elements])
     t0 = _reshape_if_necessary(t0, new_shape)
 
-    t1_shape = tuple(x.value for x in t1.get_shape())
+    t1_shape = _get_shape(t1)
     num_broadcast_elements_t1 = _total_size(
         t1_shape[len(preserved_axes)+len(axes_to_sum):])
-    new_shape = t1_shape[:len(preserved_axes)] + (num_summed_elements,
-                                                  num_broadcast_elements_t1)
+    new_shape = (t1_shape[:len(preserved_axes)]
+                 + [num_summed_elements, num_broadcast_elements_t1])
     t1 = _reshape_if_necessary(t1, new_shape)
 
     product = math_ops.matmul(t0, t1)
 
     # Undo compaction of broadcast axes
     uncompacted_shape = (
-        t0_shape[:len(preserved_axes)+len(broadcast_axes[0])] +
-        t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
+        t0_shape[:len(preserved_axes)+len(broadcast_axes[0])]
+        + t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
     )
-
-    # Check the number of None values and replace them with Tensors containing
-    # corresponding dimensions if there exist two or more None values
-    num_none_dims = sum(1 for d in uncompacted_shape if d is None)
-    if num_none_dims > 1:
-      uncompacted_shape = list(uncompacted_shape)
-      for i in xrange(len(uncompacted_shape)):
-        if uncompacted_shape[i] is None:
-          if i < len(preserved_axes) + len(broadcast_axes[0]):
-            uncompacted_shape[i] = array_ops.shape(inputs[0])[i]
-          else:
-            idx = (i - len(preserved_axes) - len(broadcast_axes[0])
-                   + len(t1_shape) - len(broadcast_axes[1]))
-            uncompacted_shape[i] = array_ops.shape(inputs[1])[idx]
-      uncompacted_shape = tuple(uncompacted_shape)
-
     product = _reshape_if_necessary(product, uncompacted_shape)
 
     product_axes = (
@@ -386,13 +370,27 @@ def _reshape_if_necessary(tensor, new_shape):
     return array_ops.reshape(tensor, new_shape)
 
 
+def _get_shape(tensor):
+  """Like get_shape().as_list(), but explicitly queries the shape of a tensor
+  if necessary to ensure that the returned value contains no unknown value."""
+
+  shape = tensor.get_shape().as_list()
+  none_indices = [i for i, d in enumerate(shape) if d is None]
+  if none_indices:
+    # Query the shape if shape contains None values
+    shape_tensor = array_ops.shape(tensor)
+    for i in none_indices:
+      shape[i] = shape_tensor[i]
+  return shape
+
 def _total_size(shape_values):
-  """Given list of tensor shape values, returns total size or -1 if unknown."""
+  """Given list of tensor shape values, returns total size.
+  If shape_values contains tensor values (which are results of
+  array_ops.shape), then it returns a scalar tensor.
+  If not, it returns an integer."""
+
   result = 1
   for val in shape_values:
-    if val is None:
-      return -1
-    assert isinstance(val, int)
     result *= val
   return result
 
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 3d289bcc9a..c792d32277 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -318,7 +318,19 @@ class EinsumTest(test.TestCase):
             m1: [3, 2],
         }
         np.testing.assert_almost_equal(
-            [[7]], sess.run(out, feed_dict=feed_dict))
+           [[7]], sess.run(out, feed_dict=feed_dict))
+
+    with ops.Graph().as_default():
+      m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
+      m1 = array_ops.placeholder(dtypes.int32, shape=(None, 2))
+      out = special_math_ops.einsum('ijkl,ij->ikl', m0, m1)
+      with session.Session() as sess:
+        feed_dict = {
+            m0: [[[[1, 2]], [[2, 1]]]],
+            m1: [[3, 2]],
+        }
+        np.testing.assert_almost_equal(
+            [[[7, 8]]], sess.run(out, feed_dict=feed_dict))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 5a1a43b5d5..ee1e361093 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -723,7 +723,8 @@ class Variable(object):
     """The `Graph` of this variable."""
     return self._variable.graph
 
-  def get_shape(self):
+  @property
+  def shape(self):
     """The `TensorShape` of this variable.
 
     Returns:
@@ -731,6 +732,10 @@ class Variable(object):
     """
     return self._variable.get_shape()
 
+  def get_shape(self):
+    """Alias of Variable.shape."""
+    return self.shape
+
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
 
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 0001eb65e1..452b8f5d3b 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -42,6 +42,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 # pylint: disable=unused-import
 from tensorflow.python.framework.test_util import assert_equal_graph_def
 from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase
+from tensorflow.python.framework.test_util import gpu_device_name
 
 from tensorflow.python.ops.gradient_checker import compute_gradient_error
 from tensorflow.python.ops.gradient_checker import compute_gradient
@@ -107,15 +108,6 @@ def is_gpu_available(cuda_only=False):
     return any((x.device_type == 'GPU' or x.device_type == 'SYCL')
                for x in _device_lib.list_local_devices())
 
-
-def gpu_device_name():
-  """Returns the name of a GPU device if available or the empty string."""
-  for x in _device_lib.list_local_devices():
-    if x.device_type == 'GPU' or x.device_type == 'SYCL':
-      return x.name
-  return ''
-
-
 _allowed_symbols = [
     # We piggy-back googletest documentation.
     'Benchmark',
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index def28053f4..01ce67252e 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -57,7 +57,7 @@ namespace cuda {
 
 #ifdef __APPLE__
 static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#else
+#elif !defined(PLATFORM_WINDOWS)
 static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
 #endif
 
@@ -341,6 +341,12 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
                               CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8))
     };
   return status;
+#elif defined(PLATFORM_WINDOWS)
+  auto status =
+    port::Status{port::error::UNIMPLEMENTED,
+                 "kernel reported driver version not implemented on Windows"
+    };
+  return status;
 #else
   FILE *driver_version_file = fopen(kDriverVersionPath, "r");
   if (driver_version_file == nullptr) {
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 5c90fceaf7..eede1f5fac 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -53,10 +53,10 @@ tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
 tensorflow/tools/ci_build/ci_build.sh GPU bazel test -c opt --config=cuda //tensorflow/...
 
 # build pip with gpu support
-tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU
+tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda
 
 # build and run gpu tests using python 3
-CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU
+CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda
 
 # build android example app
 tensorflow/tools/ci_build/ci_build.sh ANDROID tensorflow/tools/ci_build/builds/android.sh
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 5527feee34..1399f1e58f 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -19,8 +19,7 @@
 # The PIP installation is done using the --user flag.
 #
 # Usage:
-#   pip.sh CONTAINER_TYPE [--mavx] [--mavx2]
-#                         [--test_tutorials] [--integration_tests]
+#   pip.sh CONTAINER_TYPE [--test_tutorials] [--integration_tests] [bazel flags]
 #
 # When executing the Python unit tests, the script obeys the shell
 # variables: TF_BUILD_BAZEL_CLEAN, TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES,
@@ -39,8 +38,7 @@
 # If NO_TEST_USER_OPS has any non-empty and non-0 value, the testing of user-
 # defined ops against the installation will be skipped.
 #
-# Use --mavx or --mavx2 to let bazel use --copt=-mavx or --copt=-mavx2 options
-# while building the pip package, respectively.
+# Any flags not listed in the usage above will be passed directly to Bazel.
 #
 # If the --test_tutorials flag is set, it will cause the script to run the
 # tutorial tests (see test_tutorials.sh) after the PIP
@@ -49,6 +47,11 @@
 # to run.
 #
 
+# Helper function: Strip leading and trailing whitespaces
+str_strip () {
+  echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
+}
+
 # Fixed naming patterns for wheel (.whl) files given different python versions
 if [[ $(uname) == "Linux" ]]; then
   declare -A WHL_TAGS
@@ -66,6 +69,7 @@ source "${SCRIPT_DIR}/builds_common.sh"
 
 # Get the command line arguments
 CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
+shift
 
 if [[ ! -z "${TF_BUILD_BAZEL_CLEAN}" ]] && \
    [[ "${TF_BUILD_BAZEL_CLEAN}" != "0" ]]; then
@@ -82,16 +86,14 @@ fi
 
 DO_TEST_TUTORIALS=0
 DO_INTEGRATION_TESTS=0
-MAVX_FLAG=""
+BAZEL_FLAGS=""
 while true; do
   if [[ "${1}" == "--test_tutorials" ]]; then
     DO_TEST_TUTORIALS=1
   elif [[ "${1}" == "--integration_tests" ]]; then
     DO_INTEGRATION_TESTS=1
-  elif [[ "${1}" == "--mavx" ]]; then
-    MAVX_FLAG="--copt=-mavx"
-  elif [[ "${1}" == "--mavx2" ]]; then
-    MAVX_FLAG="--copt=-mavx2"
+  else
+    BAZEL_FLAGS="${BAZEL_FLAGS} ${1}"
   fi
 
   shift
@@ -100,18 +102,18 @@ while true; do
   fi
 done
 
-if [[ ! -z "${MAVX_FLAG}" ]]; then
-  echo "Using MAVX flag: ${MAVX_FLAG}"
-fi
+BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
+
+echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
 GPU_FLAG=""
 if [[ ${CONTAINER_TYPE} == "cpu" ]] || \
    [[ ${CONTAINER_TYPE} == "debian.jessie.cpu" ]]; then
-  bazel build -c opt ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \
+  bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \
       die "Build failed."
 elif [[ ${CONTAINER_TYPE} == "gpu" ]]; then
-  bazel build -c opt --config=cuda ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \
+  bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \
       die "Build failed."
   GPU_FLAG="--gpu"
 else
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 3697fd46a0..f0fa8a9381 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -26,7 +26,7 @@
 #                  directory as this script will be used.
 #
 # COMMAND: Command to be executed in the docker container, e.g.,
-#          tensorflow/tools/ci_build/builds/pip.sh gpu
+#          tensorflow/tools/ci_build/builds/pip.sh gpu -c opt --config=cuda
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/builds/builds_common.sh"
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 5e2cf94059..c2ad03bc6f 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -83,7 +83,8 @@
 #                      support for Google Cloud Platform (GCP), which is
 #                      enabled by default.
 #   TF_BUILD_OPTIONS:
-#                     (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2)
+#                     (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2_FMA | MAVX_DBG |
+#                      MAVX2_FMA_DBG)
 #                     Use the specified configurations when building.
 #                     When set, overrides TF_BUILD_IS_OPT and TF_BUILD_MAVX
 #                     options, as this will replace the two.
@@ -304,11 +305,14 @@ else
     MAVX)
       OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx"
       ;;
-    MAVXDBG)
+    MAVX_DBG)
       OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx"
       ;;
-    MAVX2)
-      OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2"
+    MAVX2_FMA)
+      OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2 --copt=-mfma"
+      ;;
+    MAVX2_FMA_DBG)
+      OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx2 --copt=-mfma"
       ;;
   esac
 fi
@@ -376,12 +380,7 @@ if [[ ${TF_BUILD_IS_PIP} == "pip" ]] ||
     exit 0
   fi
 
-  PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_AGRS}"
-
-  # Add flag for mavx/mavx2
-  if [[ ! -z "${TF_BUILD_MAVX}" ]]; then
-    PIP_MAIN_CMD="${PIP_MAIN_CMD} --${TF_BUILD_MAVX}"
-  fi
+  PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_ARGS} ${OPT_FLAG}"
 
   # Add flag for integration tests
   if [[ ! -z "${TF_BUILD_INTEGRATION_TESTS}" ]] &&
diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh
index 2f3470881a..5420934c6b 100755
--- a/tensorflow/tools/ci_build/install/install_buildifier.sh
+++ b/tensorflow/tools/ci_build/install/install_buildifier.sh
@@ -16,8 +16,9 @@
 
 set -e
 BUILDIFIER_DIR="buildifier"
-rm -rf ${BUILDIFIER_DIR}
-git clone https://github.com/bazelbuild/buildifier.git ${BUILDIFIER_DIR}
+mkdir ${BUILDIFIER_DIR}
+curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.3.tar.gz | \
+    tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz
 pushd ${BUILDIFIER_DIR}
 
 bazel build buildifier:buildifier --spawn_strategy=standalone --genrule_strategy=standalone
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 35c1218470..886266caaf 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -266,7 +266,7 @@ fi
 IMG="${USER}/tensorflow:${FINAL_TAG}"
 echo "Building docker image with image name and tag: ${IMG}"
 
-"${DOCKER_BINARY}" build --no-cache -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}"
+"${DOCKER_BINARY}" build --no-cache --pull -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}"
 if [[ $? == "0" ]]; then
   echo "${DOCKER_BINARY} build of ${IMG} succeeded"
 else
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 36a5c01a0c..73af8699b2 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -179,7 +179,7 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --outputs='softmax:0' \
 --transforms='\
 strip_unused_nodes(type=float, shape="1,299,299,3") \
-fold_constants \
+fold_constants(ignore_errors=true) \
 fold_batch_norms \
 fold_old_batch_norms\
 '
@@ -715,7 +715,7 @@ shape arguments let you control the attributes of any new Placeholders that are
 created. Plain `type` and `shape` set global defaults, but if you have different
 inputs with varying characteristics, you'll need to pass in a list of arguments
 where the preceding name specifies what layer each applies to. For example, if
-you had two inputs in1 and in2, you could call `strip_unused_node(name=in1,
+you had two inputs in1 and in2, you could call `strip_unused_nodes(name=in1,
 type_for_name=int32, shape_for_name="2,3", name=in2, type_for_name=float,
 shape_for_name="1,10,10,3")`.
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index ea4c7e9ff4..9b229704d6 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.0.0-rc2'
+_VERSION = '1.0.0'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5a2fe99c67..66ff8843d8 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -406,12 +406,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       actual = "@zlib_archive//:zlib",
   )
 
-  native.new_http_archive(
+  temp_workaround_http_archive(
       name = "nccl_archive",
-      url = "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+      urls = [
+          "http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+          "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+      ],
       sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
       strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
       build_file = str(Label("//third_party:nccl.BUILD")),
+      repository = tf_repo_name,
   )
 
   java_import_external(
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index dde8e6cdb7..557c2885eb 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -204,13 +204,13 @@ cc_library(
         "lib/wildcard.h",
         "lib/x509asn1.h",
     ] + select({
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "lib/vtls/darwinssl.c",
         ],
-        ":ios": [
+        "@//tensorflow:ios": [
             "lib/vtls/darwinssl.c",
         ],
-        ":windows": [
+        "@//tensorflow:windows": [
             "lib/asyn-thread.c",
             "lib/inet_ntop.c",
             "lib/system_win32.c",
@@ -231,7 +231,7 @@ cc_library(
         "include/curl/typecheck-gcc.h",
     ],
     copts = select({
-        ":windows": [
+        "@//tensorflow:windows": [
             "/I%prefix%/curl/lib",
             "/DHAVE_CONFIG_H",
             "/DCURL_DISABLE_FTP",
@@ -255,10 +255,10 @@ cc_library(
             "-Wno-string-plus-int",
         ],
     }) + select({
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "-fno-constant-cfstrings",
         ],
-        ":windows": [
+        "@//tensorflow:windows": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
@@ -268,17 +268,17 @@ cc_library(
     }),
     includes = ["include"],
     linkopts = select({
-        ":android": [
+        "@//tensorflow:android": [
             "-pie",
         ],
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        ":ios": [],
-        ":windows": [
+        "@//tensorflow:ios": [],
+        "@//tensorflow:windows": [
             "ws2_32.lib",
         ],
         "//conditions:default": [
@@ -289,8 +289,8 @@ cc_library(
     deps = [
         "@zlib_archive//:zlib",
     ] + select({
-        ":ios": [],
-        ":windows": [],
+        "@//tensorflow:ios": [],
+        "@//tensorflow:windows": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -386,7 +386,7 @@ cc_binary(
         "src/tool_xattr.h",
     ],
     copts = select({
-        ":windows": [
+        "@//tensorflow:windows": [
             "/I%prefix%/curl/lib",
             "/DHAVE_CONFIG_H",
             "/DCURL_DISABLE_LIBCURL_OPTION",
@@ -657,23 +657,3 @@ genrule(
         "EOF",
     ]),
 )
-
-config_setting(
-    name = "ios",
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-)
-
-config_setting(
-    name = "darwin",
-    values = {"cpu": "darwin"},
-)
-
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows_msvc"},
-)
-
-config_setting(
-    name = "android",
-    values = {"crosstool_top": "//external:android/crosstool"},
-)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 98deb1742e..078be83e0d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -11,6 +11,13 @@ typedef struct Packet32q8i {
   Packet32q8i(__m256i val) : val(val) {}
 } Packet32q8i;
 
+typedef struct Packet16q16i {
+  __m256i val;
+  operator __m256i() const { return val; }
+  Packet16q16i();
+  Packet16q16i(__m256i val) : val(val) {}
+} Packet16q16i;
+
 typedef struct Packet32q8u {
   __m256i val;
   operator __m256i() const { return val; }
@@ -32,6 +39,13 @@ typedef struct Packet16q8u {
   Packet16q8u(__m128i val) : val(val) {}
 } Packet16q8u;
 
+typedef struct Packet8q16i {
+  __m128i val;
+  operator __m128i() const { return val; }
+  Packet8q16i();
+  Packet8q16i(__m128i val) : val(val) {}
+} Packet8q16i;
+
 typedef struct Packet8q32i {
   __m256i val;
   operator __m256i() const { return val; }
@@ -92,6 +106,28 @@ struct packet_traits<QUInt8> : default_packet_traits {
   };
 };
 template <>
+struct packet_traits<QInt16> : default_packet_traits {
+  typedef Packet16q16i type;
+  typedef Packet8q16i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+  };
+  enum {
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
+    HasNegate = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0
+  };
+};
+template <>
 struct packet_traits<QInt32> : default_packet_traits {
   typedef Packet8q32i type;
   typedef Packet4q32i half;
@@ -122,6 +158,12 @@ struct unpacket_traits<Packet32q8i> {
   enum { size = 32, alignment=Aligned32 };
 };
 template <>
+struct unpacket_traits<Packet16q16i> {
+  typedef QInt16 type;
+  typedef Packet8q16i half;
+  enum { size = 16, alignment=Aligned32 };
+};
+template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
@@ -146,6 +188,11 @@ EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
+      reinterpret_cast<const __m256i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -163,6 +210,11 @@ EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
+      reinterpret_cast<const __m256i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -180,6 +232,11 @@ EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
+      reinterpret_cast<__m256i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -192,6 +249,11 @@ EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
                                                from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
+                                               from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
@@ -208,6 +270,10 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
   return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
 }
 template <>
+EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
+  return _mm256_extract_epi16(a.val, 0);
+}
+template <>
 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
   return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0));
 }
@@ -237,6 +303,10 @@ EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
   return _mm256_add_epi32(a.val, b.val);
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
                                                   const Packet8q32i& b) {
   return _mm256_sub_epi32(a.val, b.val);
@@ -265,6 +335,17 @@ EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
+                                                    const Packet16q16i& b) {
+  return _mm256_min_epi16(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
+                                                    const Packet16q16i& b) {
+  return _mm256_max_epi16(a.val, b.val);
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
                                                   const Packet32q8u& b) {
   return _mm256_min_epu8(a.val, b.val);
@@ -305,6 +386,23 @@ EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
 }
 
 template <>
+EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
+  __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+  tmp =
+      _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+  return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
+  __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+  tmp =
+      _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+  return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+
+template <>
 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
   __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
   tmp =
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index ddaf29a086..7e95ebd355 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -8,12 +8,17 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
+
 cc_library(
     name = "intel_binary_blob",
-    srcs = [
-        "libiomp5.so",
+    srcs = if_mkl([
         "libmklml_intel.so",
-    ],
+        "libiomp5.so",
+    ]),
     includes = ["."],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index bb460a05e0..06b9b8ff68 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -43,6 +43,24 @@ cc_library(
         "-Iexternal/nccl_archive/src",
         "-O3",
     ] + cuda_default_copts(),
+    linkopts = select({
+        "@%ws%//tensorflow:android": [
+            "-pie",
+        ],
+        "@%ws%//tensorflow:darwin": [
+            "-Wl,-framework",
+            "-Wl,CoreFoundation",
+            "-Wl,-framework",
+            "-Wl,Security",
+        ],
+        "@%ws%//tensorflow:ios": [],
+        "@%ws%//tensorflow:windows": [
+            "ws2_32.lib",
+        ],
+        "//conditions:default": [
+            "-lrt",
+        ],
+    }),
     visibility = ["//visibility:public"],
     deps = ["@local_config_cuda//cuda:cuda_headers"],
 )
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index a5e6b9fe93..66dd9aea7b 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -26,9 +26,7 @@ def main():
 
   if(output_file_index == 1):
     # we are linking
-    return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
-
-  compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1']
+    return subprocess.call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined'])
 
   # find what we compile
   compiling_cpp = 0
@@ -38,6 +36,28 @@ def main():
       if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
           compiling_cpp = 1;
 
+  compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1', '-DTENSORFLOW_USE_SYCL', '-DEIGEN_HAS_C99_MATH']
+
+  if(compiling_cpp == 1):
+      # create a blacklist of folders that will be skipped when compiling with ComputeCpp
+      _skip = ["external", "llvm", ".cu.cc"]
+      # if compiling external project skip computecpp
+      if any(_folder in _skip for _folder in output_file_name):
+        return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
+  if(compiling_cpp == 1):
+      # this is an optimisation that will check if compiled file has to be compiled with ComputeCpp
+
+      _tmp_flags = [flag for flag in compiler_flags if not flag.startswith(('-o', output_file_name))]
+      # create preprocessed of the file
+      _cmd = " ".join([CPU_CXX_COMPILER] + _tmp_flags + ["-E"])
+      # check if it has parallel_for< in it
+      _cmd += " | grep \".parallel_for\" > /dev/null"
+      ps = subprocess.call(_cmd, shell=True)
+      # if not call CXX compiler
+      if(ps != 0):
+          return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
   if(compiling_cpp == 1):
       filename, file_extension = os.path.splitext(output_file_name)
       bc_out = filename + '.sycl'
@@ -52,9 +72,12 @@ def main():
           # dont want that in case of compiling with computecpp first
           host_compiler_flags = [flag for flag in compiler_flags
                                     if not flag.startswith(('-MF', '-MD',))
-                                    if not '.d' in flag]
+                                    if not '.d' in flag
+                                ]
+
+          host_compiler_flags[host_compiler_flags.index('-c')] = "--include"
 
-          host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '--include', bc_out] + host_compiler_flags
+          host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags
           x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
       return x
   else:
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 48c9f5aa3f..3622b9423c 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -7,7 +7,7 @@ build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl --define=using_sycl=true
 
 build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -fsanitize=address --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -lasan
+build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
 
 build --force_python=py$PYTHON_MAJOR_VERSION
 build --host_force_python=py$PYTHON_MAJOR_VERSION