Diffstat (limited to 'tensorflow/core/kernels')
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op.cc121
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op.h64
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc43
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc22
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_test.cc88
-rw-r--r--tensorflow/core/kernels/aggregate_ops.cc238
-rw-r--r--tensorflow/core/kernels/aggregate_ops.h211
-rw-r--r--tensorflow/core/kernels/aggregate_ops_gpu.cu.cc141
-rw-r--r--tensorflow/core/kernels/argmax_op.cc163
-rw-r--r--tensorflow/core/kernels/argmax_op.h55
-rw-r--r--tensorflow/core/kernels/argmax_op_gpu.cu.cc20
-rw-r--r--tensorflow/core/kernels/assign_op.h92
-rw-r--r--tensorflow/core/kernels/attention_ops.cc92
-rw-r--r--tensorflow/core/kernels/avgpooling_op.cc418
-rw-r--r--tensorflow/core/kernels/avgpooling_op.h58
-rw-r--r--tensorflow/core/kernels/avgpooling_op_gpu.cu.cc101
-rw-r--r--tensorflow/core/kernels/batch_matmul_op.cc260
-rw-r--r--tensorflow/core/kernels/batch_norm_op.cc223
-rw-r--r--tensorflow/core/kernels/batch_norm_op.h133
-rw-r--r--tensorflow/core/kernels/batch_norm_op_gpu.cu.cc17
-rw-r--r--tensorflow/core/kernels/bcast_ops.cc71
-rw-r--r--tensorflow/core/kernels/bias_op.cc112
-rw-r--r--tensorflow/core/kernels/bias_op.h41
-rw-r--r--tensorflow/core/kernels/bias_op_gpu.cu.cc23
-rw-r--r--tensorflow/core/kernels/candidate_sampler_ops.cc243
-rw-r--r--tensorflow/core/kernels/cast_op.cc233
-rw-r--r--tensorflow/core/kernels/cast_op.h71
-rw-r--r--tensorflow/core/kernels/cast_op_gpu.cu.cc45
-rw-r--r--tensorflow/core/kernels/cast_op_test.cc100
-rw-r--r--tensorflow/core/kernels/check_numerics_op.cc190
-rw-r--r--tensorflow/core/kernels/check_numerics_op_gpu.cu.cc62
-rw-r--r--tensorflow/core/kernels/cholesky_op.cc71
-rw-r--r--tensorflow/core/kernels/concat_op.cc153
-rw-r--r--tensorflow/core/kernels/concat_op.h27
-rw-r--r--tensorflow/core/kernels/concat_op_cpu.cc122
-rw-r--r--tensorflow/core/kernels/concat_op_gpu.cu.cc41
-rw-r--r--tensorflow/core/kernels/concat_op_test.cc240
-rw-r--r--tensorflow/core/kernels/constant_op.cc249
-rw-r--r--tensorflow/core/kernels/constant_op.h25
-rw-r--r--tensorflow/core/kernels/constant_op_gpu.cu.cc89
-rw-r--r--tensorflow/core/kernels/constant_op_test.cc43
-rw-r--r--tensorflow/core/kernels/control_flow_ops.cc359
-rw-r--r--tensorflow/core/kernels/control_flow_ops.h22
-rw-r--r--tensorflow/core/kernels/control_flow_ops_test.cc71
-rw-r--r--tensorflow/core/kernels/conv_2d.h127
-rw-r--r--tensorflow/core/kernels/conv_grad_ops.cc1190
-rw-r--r--tensorflow/core/kernels/conv_ops.cc373
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu.cu.cc35
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_2.cu.cc16
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_3.cu.cc22
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc16
-rw-r--r--tensorflow/core/kernels/core_ops_test.cc990
-rw-r--r--tensorflow/core/kernels/count_up_to_op.cc51
-rw-r--r--tensorflow/core/kernels/cwise_op_abs.cc23
-rw-r--r--tensorflow/core/kernels/cwise_op_add.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_ceil.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_complex.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_conj.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_cos.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_div.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_equal_to.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_exp.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_floor.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_add.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_div.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_less.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_log.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc13
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc13
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_real.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_select.cu.cc15
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_square.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_greater.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_greater_equal.cc22
-rw-r--r--tensorflow/core/kernels/cwise_op_imag.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_inverse.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isfinite.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isinf.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isnan.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_less.cc20
-rw-r--r--tensorflow/core/kernels/cwise_op_less_equal.cc22
-rw-r--r--tensorflow/core/kernels/cwise_op_log.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_and.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_not.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_or.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_maximum.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_minimum.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_mod.cc6
-rw-r--r--tensorflow/core/kernels/cwise_op_mul.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_neg.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_not_equal_to.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_pow.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_real.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_rsqrt.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_select.cc17
-rw-r--r--tensorflow/core/kernels/cwise_op_sigmoid.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_sign.cc19
-rw-r--r--tensorflow/core/kernels/cwise_op_sin.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_sqrt.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_square.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_sub.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_tanh.cc8
-rw-r--r--tensorflow/core/kernels/cwise_ops.h607
-rw-r--r--tensorflow/core/kernels/cwise_ops_common.cc42
-rw-r--r--tensorflow/core/kernels/cwise_ops_common.h390
-rw-r--r--tensorflow/core/kernels/cwise_ops_gpu_common.cu.h135
-rw-r--r--tensorflow/core/kernels/cwise_ops_test.cc167
-rw-r--r--tensorflow/core/kernels/decode_csv_op.cc222
-rw-r--r--tensorflow/core/kernels/decode_jpeg_op.cc72
-rw-r--r--tensorflow/core/kernels/decode_png_op.cc69
-rw-r--r--tensorflow/core/kernels/decode_raw_op.cc90
-rw-r--r--tensorflow/core/kernels/dense_update_ops.cc136
-rw-r--r--tensorflow/core/kernels/dense_update_ops.h43
-rw-r--r--tensorflow/core/kernels/dense_update_ops_gpu.cu.cc22
-rw-r--r--tensorflow/core/kernels/determinant_op.cc66
-rw-r--r--tensorflow/core/kernels/diag_op.cc93
-rw-r--r--tensorflow/core/kernels/dynamic_partition_op.cc154
-rw-r--r--tensorflow/core/kernels/dynamic_partition_op_test.cc145
-rw-r--r--tensorflow/core/kernels/dynamic_stitch_op.cc158
-rw-r--r--tensorflow/core/kernels/dynamic_stitch_op_test.cc133
-rw-r--r--tensorflow/core/kernels/edit_distance_op.cc217
-rw-r--r--tensorflow/core/kernels/encode_jpeg_op.cc114
-rw-r--r--tensorflow/core/kernels/encode_png_op.cc52
-rw-r--r--tensorflow/core/kernels/example_parsing_ops.cc444
-rw-r--r--tensorflow/core/kernels/fact_op.cc96
-rw-r--r--tensorflow/core/kernels/fifo_queue.cc518
-rw-r--r--tensorflow/core/kernels/fifo_queue.h127
-rw-r--r--tensorflow/core/kernels/fifo_queue_op.cc93
-rw-r--r--tensorflow/core/kernels/fill_functor.h26
-rw-r--r--tensorflow/core/kernels/fixed_length_record_reader_op.cc109
-rw-r--r--tensorflow/core/kernels/gather_op.cc136
-rw-r--r--tensorflow/core/kernels/gather_op_test.cc213
-rw-r--r--tensorflow/core/kernels/identity_op.cc45
-rw-r--r--tensorflow/core/kernels/identity_op.h25
-rw-r--r--tensorflow/core/kernels/identity_op_test.cc56
-rw-r--r--tensorflow/core/kernels/identity_reader_op.cc57
-rw-r--r--tensorflow/core/kernels/in_topk_op.cc58
-rw-r--r--tensorflow/core/kernels/initializable_lookup_table.cc41
-rw-r--r--tensorflow/core/kernels/initializable_lookup_table.h103
-rw-r--r--tensorflow/core/kernels/io.cc270
-rw-r--r--tensorflow/core/kernels/io.h38
-rw-r--r--tensorflow/core/kernels/l2loss_op.cc69
-rw-r--r--tensorflow/core/kernels/l2loss_op.h24
-rw-r--r--tensorflow/core/kernels/l2loss_op_gpu.cu.cc16
-rw-r--r--tensorflow/core/kernels/linalg_ops_common.cc99
-rw-r--r--tensorflow/core/kernels/linalg_ops_common.h123
-rw-r--r--tensorflow/core/kernels/listdiff_op.cc75
-rw-r--r--tensorflow/core/kernels/logging_ops.cc77
-rw-r--r--tensorflow/core/kernels/logging_ops_test.cc87
-rw-r--r--tensorflow/core/kernels/lookup_table_init_op.cc116
-rw-r--r--tensorflow/core/kernels/lookup_table_op.cc166
-rw-r--r--tensorflow/core/kernels/lookup_table_op.h80
-rw-r--r--tensorflow/core/kernels/lookup_util.cc72
-rw-r--r--tensorflow/core/kernels/lookup_util.h31
-rw-r--r--tensorflow/core/kernels/lrn_op.cc228
-rw-r--r--tensorflow/core/kernels/lrn_op_test.cc185
-rw-r--r--tensorflow/core/kernels/matching_files_op.cc42
-rw-r--r--tensorflow/core/kernels/matmul_op.cc214
-rw-r--r--tensorflow/core/kernels/matmul_op.h40
-rw-r--r--tensorflow/core/kernels/matmul_op_gpu.cu.cc32
-rw-r--r--tensorflow/core/kernels/matmul_op_test.cc56
-rw-r--r--tensorflow/core/kernels/matrix_inverse_op.cc64
-rw-r--r--tensorflow/core/kernels/maxpooling_op.cc554
-rw-r--r--tensorflow/core/kernels/maxpooling_op.h29
-rw-r--r--tensorflow/core/kernels/maxpooling_op_gpu.cu.cc261
-rw-r--r--tensorflow/core/kernels/maxpooling_op_gpu.h42
-rw-r--r--tensorflow/core/kernels/no_op.cc8
-rw-r--r--tensorflow/core/kernels/no_op.h17
-rw-r--r--tensorflow/core/kernels/ops_testutil.cc18
-rw-r--r--tensorflow/core/kernels/ops_testutil.h191
-rw-r--r--tensorflow/core/kernels/ops_util.cc113
-rw-r--r--tensorflow/core/kernels/ops_util.h180
-rw-r--r--tensorflow/core/kernels/ops_util_test.cc265
-rw-r--r--tensorflow/core/kernels/pack_op.cc114
-rw-r--r--tensorflow/core/kernels/pad_op.cc159
-rw-r--r--tensorflow/core/kernels/pad_op.h27
-rw-r--r--tensorflow/core/kernels/pad_op_gpu.cu.cc26
-rw-r--r--tensorflow/core/kernels/pooling_ops_common.cc252
-rw-r--r--tensorflow/core/kernels/pooling_ops_common.h264
-rw-r--r--tensorflow/core/kernels/pooling_ops_common_gpu.h39
-rw-r--r--tensorflow/core/kernels/queue_base.cc153
-rw-r--r--tensorflow/core/kernels/queue_base.h77
-rw-r--r--tensorflow/core/kernels/queue_ops.cc288
-rw-r--r--tensorflow/core/kernels/random_crop_op.cc103
-rw-r--r--tensorflow/core/kernels/random_crop_op_test.cc60
-rw-r--r--tensorflow/core/kernels/random_op.cc276
-rw-r--r--tensorflow/core/kernels/random_op.h16
-rw-r--r--tensorflow/core/kernels/random_op_gpu.cu.cc152
-rw-r--r--tensorflow/core/kernels/random_op_test.cc99
-rw-r--r--tensorflow/core/kernels/random_shuffle_op.cc89
-rw-r--r--tensorflow/core/kernels/random_shuffle_queue_op.cc740
-rw-r--r--tensorflow/core/kernels/range_sampler.cc305
-rw-r--r--tensorflow/core/kernels/range_sampler.h237
-rw-r--r--tensorflow/core/kernels/range_sampler_test.cc320
-rw-r--r--tensorflow/core/kernels/reader_base.cc156
-rw-r--r--tensorflow/core/kernels/reader_base.h107
-rw-r--r--tensorflow/core/kernels/reader_base.proto13
-rw-r--r--tensorflow/core/kernels/reader_ops.cc132
-rw-r--r--tensorflow/core/kernels/reduction_ops.h66
-rw-r--r--tensorflow/core/kernels/reduction_ops_all.cc17
-rw-r--r--tensorflow/core/kernels/reduction_ops_any.cc17
-rw-r--r--tensorflow/core/kernels/reduction_ops_common.h302
-rw-r--r--tensorflow/core/kernels/reduction_ops_gpu.cu.cc65
-rw-r--r--tensorflow/core/kernels/reduction_ops_max.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_mean.cc12
-rw-r--r--tensorflow/core/kernels/reduction_ops_min.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_prod.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_sum.cc37
-rw-r--r--tensorflow/core/kernels/reduction_ops_test.cc73
-rw-r--r--tensorflow/core/kernels/reference_gemm.h75
-rw-r--r--tensorflow/core/kernels/relu_op.cc154
-rw-r--r--tensorflow/core/kernels/relu_op.h79
-rw-r--r--tensorflow/core/kernels/relu_op_gpu.cu.cc27
-rw-r--r--tensorflow/core/kernels/reshape_op.cc29
-rw-r--r--tensorflow/core/kernels/reshape_op.h83
-rw-r--r--tensorflow/core/kernels/resize_area_op.cc139
-rw-r--r--tensorflow/core/kernels/resize_bicubic_op.cc121
-rw-r--r--tensorflow/core/kernels/resize_bilinear_op.cc109
-rw-r--r--tensorflow/core/kernels/resize_bilinear_op_test.cc171
-rw-r--r--tensorflow/core/kernels/resize_nearest_neighbor_op.cc89
-rw-r--r--tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc163
-rw-r--r--tensorflow/core/kernels/restore_op.cc65
-rw-r--r--tensorflow/core/kernels/restore_op_test.cc305
-rw-r--r--tensorflow/core/kernels/reverse_op.cc139
-rw-r--r--tensorflow/core/kernels/reverse_op.h28
-rw-r--r--tensorflow/core/kernels/reverse_op_gpu.cu.cc33
-rw-r--r--tensorflow/core/kernels/reverse_op_test.cc101
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op.cc170
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op.h56
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc26
-rw-r--r--tensorflow/core/kernels/save_op.cc81
-rw-r--r--tensorflow/core/kernels/save_op_test.cc443
-rw-r--r--tensorflow/core/kernels/scatter_op.cc167
-rw-r--r--tensorflow/core/kernels/scatter_op_test.cc255
-rw-r--r--tensorflow/core/kernels/segment_reduction_ops.cc466
-rw-r--r--tensorflow/core/kernels/segment_reduction_ops_test.cc157
-rw-r--r--tensorflow/core/kernels/sendrecv_ops.cc116
-rw-r--r--tensorflow/core/kernels/sendrecv_ops.h32
-rw-r--r--tensorflow/core/kernels/sequence_ops.cc123
-rw-r--r--tensorflow/core/kernels/shape_ops.cc261
-rw-r--r--tensorflow/core/kernels/slice_op.cc242
-rw-r--r--tensorflow/core/kernels/slice_op.h25
-rw-r--r--tensorflow/core/kernels/slice_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/slice_op_test.cc73
-rw-r--r--tensorflow/core/kernels/softmax_op.cc62
-rw-r--r--tensorflow/core/kernels/softmax_op.h70
-rw-r--r--tensorflow/core/kernels/softmax_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/softplus_op.cc97
-rw-r--r--tensorflow/core/kernels/softplus_op.h46
-rw-r--r--tensorflow/core/kernels/softplus_op_gpu.cu.cc25
-rw-r--r--tensorflow/core/kernels/sparse_concat_op.cc139
-rw-r--r--tensorflow/core/kernels/sparse_matmul_op.cc192
-rw-r--r--tensorflow/core/kernels/sparse_matmul_op_test.cc139
-rw-r--r--tensorflow/core/kernels/sparse_reorder_op.cc71
-rw-r--r--tensorflow/core/kernels/sparse_to_dense_op.cc129
-rw-r--r--tensorflow/core/kernels/sparse_to_dense_op_test.cc283
-rw-r--r--tensorflow/core/kernels/split_op.cc146
-rw-r--r--tensorflow/core/kernels/split_op.h31
-rw-r--r--tensorflow/core/kernels/split_op_cpu.cc30
-rw-r--r--tensorflow/core/kernels/split_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/string_to_hash_bucket_op.cc47
-rw-r--r--tensorflow/core/kernels/string_to_number_op.cc71
-rw-r--r--tensorflow/core/kernels/summary_image_op.cc169
-rw-r--r--tensorflow/core/kernels/summary_image_op_test.cc141
-rw-r--r--tensorflow/core/kernels/summary_op.cc141
-rw-r--r--tensorflow/core/kernels/summary_op_test.cc282
-rw-r--r--tensorflow/core/kernels/text_line_reader_op.cc99
-rw-r--r--tensorflow/core/kernels/tf_record_reader_op.cc76
-rw-r--r--tensorflow/core/kernels/tile_ops.cc460
-rw-r--r--tensorflow/core/kernels/tile_ops.h48
-rw-r--r--tensorflow/core/kernels/tile_ops_gpu.cu.cc38
-rw-r--r--tensorflow/core/kernels/topk_op.cc71
-rw-r--r--tensorflow/core/kernels/training_ops.cc884
-rw-r--r--tensorflow/core/kernels/training_ops.h65
-rw-r--r--tensorflow/core/kernels/training_ops_gpu.cu.cc127
-rw-r--r--tensorflow/core/kernels/training_ops_test.cc226
-rw-r--r--tensorflow/core/kernels/transpose_op.cc190
-rw-r--r--tensorflow/core/kernels/transpose_op.h19
-rw-r--r--tensorflow/core/kernels/transpose_op_functor.h28
-rw-r--r--tensorflow/core/kernels/transpose_op_gpu.cu.cc43
-rw-r--r--tensorflow/core/kernels/unique_op.cc61
-rw-r--r--tensorflow/core/kernels/unique_op_test.cc51
-rw-r--r--tensorflow/core/kernels/unpack_op.cc96
-rw-r--r--tensorflow/core/kernels/variable_ops.cc37
-rw-r--r--tensorflow/core/kernels/variable_ops.h146
-rw-r--r--tensorflow/core/kernels/where_op.cc74
-rw-r--r--tensorflow/core/kernels/where_op.h65
-rw-r--r--tensorflow/core/kernels/whole_file_read_ops.cc108
-rw-r--r--tensorflow/core/kernels/xent_op.cc90
-rw-r--r--tensorflow/core/kernels/xent_op.h102
-rw-r--r--tensorflow/core/kernels/xent_op_gpu.cu.cc35
-rw-r--r--tensorflow/core/kernels/xent_op_test.cc46
323 files changed, 33366 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
new file mode 100644
index 0000000000..7cc0534354
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -0,0 +1,121 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/adjust_contrast_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AdjustContrastOp : public OpKernel {
+ public:
+ explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& factor = context->input(1);
+ const Tensor& min_value = context->input(2);
+ const Tensor& max_value = context->input(3);
+ OP_REQUIRES(context, input.dims() >= 3,
+        errors::InvalidArgument("input must be at least 3-D, got shape ",
+ input.shape().ShortDebugString()));
+ const int64 height = input.dim_size(input.dims() - 3);
+ const int64 width = input.dim_size(input.dims() - 2);
+ const int64 channels = input.dim_size(input.dims() - 1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor.shape()),
+ errors::InvalidArgument("contrast_factor must be scalar: ",
+ factor.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(min_value.shape()),
+ errors::InvalidArgument("min_value must be scalar: ",
+ min_value.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_value.shape()),
+ errors::InvalidArgument("max_value must be scalar: ",
+ max_value.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ Tensor mean_values;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value,
+ TensorShape(input.shape()),
+ &mean_values));
+
+ if (input.NumElements() > 0) {
+ const int64 batch = input.NumElements() / (height * width * channels);
+ const int64 shape[4] = {batch, height, width, channels};
+ functor::AdjustContrast<Device, T>()(
+ context->eigen_device<Device>(), input.shaped<T, 4>(shape),
+ factor.scalar<float>(), min_value.scalar<float>(),
+ max_value.scalar<float>(), mean_values.shaped<float, 4>(shape),
+ output->shaped<float, 4>(shape));
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AdjustContrast").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ AdjustContrastOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int16);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the function specializations for GPU (to prevent
+// building the GPU versions here, they will be built compiling _gpu.cu.cc).
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void AdjustContrast<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<float>::ConstScalar contrast_factor, \
+ typename TTypes<float>::ConstScalar min_value, \
+ typename TTypes<float>::ConstScalar max_value, \
+ typename TTypes<float, 4>::Tensor mean_values, \
+ typename TTypes<float, 4>::Tensor output); \
+ extern template struct AdjustContrast<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(uint8);
+DECLARE_GPU_SPEC(int8);
+DECLARE_GPU_SPEC(int16);
+DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AdjustContrast").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ AdjustContrastOp<GPUDevice, T>);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int32);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
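
AdjustContrastOp::Compute above accepts any input of rank >= 3 and views it as 4-D by folding all leading dimensions into a single batch dimension (batch = NumElements / (height * width * channels)), so for example a [2, 5, 299, 299, 3] input is processed as if it were [10, 299, 299, 3]. The helper below is a minimal illustration of that folding on a plain shape vector; it is not part of the patch and the name is made up.

// Illustrative only: folds every leading dimension of an NHWC-like shape
// into one batch dimension, mirroring the input.shaped<T, 4>(shape) call above.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> FoldToBatchHWC(const std::vector<int64_t>& dims) {
  // Assumes dims.size() >= 3; the trailing dims are height, width, channels.
  const int64_t channels = dims[dims.size() - 1];
  const int64_t width = dims[dims.size() - 2];
  const int64_t height = dims[dims.size() - 3];
  const int64_t total = std::accumulate(dims.begin(), dims.end(), int64_t{1},
                                        std::multiplies<int64_t>());
  return {total / (height * width * channels), height, width, channels};
}
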
diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h
new file mode 100644
index 0000000000..2182b33c03
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op.h
@@ -0,0 +1,64 @@
+#ifndef TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#define TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by AdjustContrastOp to do the computations.
+template <typename Device, typename T>
+struct AdjustContrast {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<float>::ConstScalar contrast_factor,
+ typename TTypes<float>::ConstScalar min_value,
+ typename TTypes<float>::ConstScalar max_value,
+ typename TTypes<float, 4>::Tensor mean_values,
+ typename TTypes<float, 4>::Tensor output) {
+ const int batch = input.dimension(0);
+ const int height = input.dimension(1);
+ const int width = input.dimension(2);
+ const int channels = input.dimension(3);
+
+ Eigen::array<int, 4> scalar_broadcast{{batch, height, width, channels}};
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::array<int, 2> reduction_axis{{1, 2}};
+ Eigen::array<int, 4> scalar{{1, 1, 1, 1}};
+ Eigen::array<int, 4> broadcast_dims{{1, height, width, 1}};
+ Eigen::Tensor<int, 4>::Dimensions reshape_dims{{batch, 1, 1, channels}};
+#else
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >
+ reduction_axis;
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<1>,
+ Eigen::type2index<1>, Eigen::type2index<1> > scalar;
+ Eigen::IndexList<Eigen::type2index<1>, int, int, Eigen::type2index<1> >
+ broadcast_dims;
+ broadcast_dims.set(1, height);
+ broadcast_dims.set(2, width);
+ Eigen::IndexList<int, Eigen::type2index<1>, Eigen::type2index<1>, int>
+ reshape_dims;
+ reshape_dims.set(0, batch);
+ reshape_dims.set(3, channels);
+#endif
+ mean_values.device(d) = input.template cast<float>()
+ .mean(reduction_axis)
+ .eval()
+ .reshape(reshape_dims)
+ .broadcast(broadcast_dims);
+
+ auto contrast_factor_tensor =
+ contrast_factor.reshape(scalar).broadcast(scalar_broadcast);
+ auto adjusted =
+ (input.template cast<float>() - mean_values) * contrast_factor_tensor +
+ mean_values;
+ auto min_bcast = min_value.reshape(scalar).broadcast(scalar_broadcast);
+ auto max_bcast = max_value.reshape(scalar).broadcast(scalar_broadcast);
+ // TODO(wicke): This is rather slow and should be re-written as pure cuda.
+ output.device(d) = adjusted.cwiseMin(max_bcast).cwiseMax(min_bcast);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
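
The functor above computes a per-image, per-channel mean over height and width, moves each pixel toward or away from that mean by contrast_factor, and clamps the result to [min_value, max_value]. The loop-based sketch below restates the same math on a contiguous NHWC float buffer; it is illustrative only and is not the Eigen code path built by this patch.

#include <algorithm>
#include <vector>

// Reference contrast adjustment over an NHWC float buffer (illustrative).
void AdjustContrastReference(const std::vector<float>& in, int batch,
                             int height, int width, int channels, float factor,
                             float min_value, float max_value,
                             std::vector<float>* out) {
  out->resize(in.size());
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      // Mean over the spatial dimensions for this image and channel.
      double sum = 0.0;
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
          sum += in[((b * height + y) * width + x) * channels + c];
      const float mean = static_cast<float>(sum / (height * width));
      // Scale the distance from the mean, then clamp.
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x) {
          const int idx = ((b * height + y) * width + x) * channels + c;
          const float adjusted = (in[idx] - mean) * factor + mean;
          (*out)[idx] = std::min(max_value, std::max(min_value, adjusted));
        }
    }
  }
}
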
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
new file mode 100644
index 0000000000..75b177cf4d
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -0,0 +1,43 @@
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* BM_AdjustContrast(int batches, int width, int height) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor in(DT_UINT8, TensorShape({batches, width, height, 3}));
+ in.flat<uint8>().setRandom();
+ Tensor factor(DT_FLOAT, TensorShape({}));
+ factor.flat<float>().setConstant(1.2);
+ Tensor min_value(DT_FLOAT, TensorShape({}));
+ min_value.flat<float>().setConstant(7.);
+ Tensor max_value(DT_FLOAT, TensorShape({}));
+ max_value.flat<float>().setConstant(250.);
+
+ Node* ret;
+ NodeBuilder(g->NewName("n"), "AdjustContrast")
+ .Input(test::graph::Constant(g, in))
+ .Input(test::graph::Constant(g, factor))
+ .Input(test::graph::Constant(g, min_value))
+ .Input(test::graph::Constant(g, max_value))
+ .Finalize(g, &ret);
+ return g;
+}
+
+#define BM_AdjustContrastDev(DEVICE, B, W, H) \
+ static void BM_AdjustContrast_##DEVICE##_##B##_##W##_##H(int iters) { \
+ testing::ItemsProcessed(iters* B* W* H * 3); \
+ test::Benchmark(#DEVICE, BM_AdjustContrast(B, W, H)).Run(iters); \
+ } \
+ BENCHMARK(BM_AdjustContrast_##DEVICE##_##B##_##W##_##H);
+
+// Benchmark results as of cl/106323955
+// BM_AdjustContrast_cpu_1_299_299 3416770 22008951 100 11.6M items/s
+
+// BM_AdjustContrast_gpu_32_299_299 37117844 45512374 100 179.8M items/s
+BM_AdjustContrastDev(cpu, 1, 299, 299) BM_AdjustContrastDev(gpu, 32, 299, 299)
+
+} // namespace tensorflow
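
Each BM_AdjustContrastDev invocation above stamps out a static benchmark function and registers it with BENCHMARK. Expanded by hand for the (cpu, 1, 299, 299) case, the macro produces approximately the following; this is a sketch of the expansion, not additional code in the patch.

// Approximate expansion of BM_AdjustContrastDev(cpu, 1, 299, 299).
static void BM_AdjustContrast_cpu_1_299_299(int iters) {
  testing::ItemsProcessed(iters * 1 * 299 * 299 * 3);
  test::Benchmark("cpu", BM_AdjustContrast(1, 299, 299)).Run(iters);
}
BENCHMARK(BM_AdjustContrast_cpu_1_299_299);
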
diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
new file mode 100644
index 0000000000..7a9b0726fd
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/adjust_contrast_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::AdjustContrast<GPUDevice, uint8>;
+template struct functor::AdjustContrast<GPUDevice, int8>;
+template struct functor::AdjustContrast<GPUDevice, int16>;
+template struct functor::AdjustContrast<GPUDevice, int32>;
+template struct functor::AdjustContrast<GPUDevice, int64>;
+template struct functor::AdjustContrast<GPUDevice, float>;
+template struct functor::AdjustContrast<GPUDevice, double>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
new file mode 100644
index 0000000000..67891e4fa1
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -0,0 +1,88 @@
+#include "tensorflow/core/framework/allocator.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+class AdjustContrastOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() { RequireDefaultOps(); }
+};
+
+TEST_F(AdjustContrastOpTest, Simple_1113) {
+ RequireDefaultOps();
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({1, 1, 1, 3}), {-1, 2, 3});
+ AddInputFromArray<float>(TensorShape({}), {1.0});
+ AddInputFromArray<float>(TensorShape({}), {0.0});
+ AddInputFromArray<float>(TensorShape({}), {2.0});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 3}));
+ test::FillValues<float>(&expected, {0, 2, 2});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(AdjustContrastOpTest, Simple_1223) {
+ RequireDefaultOps();
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 3}),
+ {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12});
+ AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0.0});
+ AddInputFromArray<float>(TensorShape({}), {10.0});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 3}));
+ test::FillValues<float>(
+ &expected, {2.2, 6.2, 10, 2.4, 6.4, 10, 2.6, 6.6, 10, 2.8, 6.8, 10});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(AdjustContrastOpTest, Big_99x99x3) {
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+
+ std::vector<float> values;
+ for (int i = 0; i < 99 * 99 * 3; ++i) {
+ values.push_back(i % 255);
+ }
+
+ AddInputFromArray<float>(TensorShape({1, 99, 99, 3}), values);
+ AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0});
+ AddInputFromArray<float>(TensorShape({}), {255});
+ ASSERT_OK(RunOpKernel());
+}
+
+} // namespace tensorflow
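
The expected tensor in Simple_1223 follows directly from the functor's formula. The 1x2x2x3 input interleaves channels, so channel 0 holds {1, 2, 3, 4} (mean 2.5), channel 1 holds {5, 6, 7, 8} (mean 6.5), and channel 2 holds {9, 10, 11, 12} (mean 10.5). With contrast_factor 0.2 each value becomes (x - mean) * 0.2 + mean, giving {2.2, 2.4, 2.6, 2.8} for channel 0 and {6.2, 6.4, 6.6, 6.8} for channel 1; channel 2 would give {10.2, 10.4, 10.6, 10.8} but is clamped to the max_value of 10, which is why every third expected entry is 10.
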
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
new file mode 100644
index 0000000000..426e868735
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -0,0 +1,238 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/aggregate_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+#include "tensorflow/core/platform/logging.h"
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AddNOp : public OpKernel {
+ public:
+ explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ if (!ctx->ValidateInputsAreSameShape(this)) return;
+
+ const Tensor& input0 = ctx->input(0);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
+ auto To = output->flat<T>();
+
+ const int num = ctx->num_inputs();
+ if (num == 1) {
+ *output = input0;
+ return;
+ }
+
+#define I(IDX) ctx->input(IDX).flat<T>()
+
+#if defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID)
+ // On Android, we only support additions of two arguments, so we
+ // can reduce the number of template instantiations.
+ OP_REQUIRES(ctx, num == 2,
+ errors::InvalidArgument("Only additions of two arguments "
+ "supported. Num inputs: ",
+ num));
+ functor::Add2Functor<Device, T> functor2;
+ functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
+#else
+ static const int kWidth = 8;
+ int r = num % kWidth;
+
+ switch (r) {
+ case 2: {
+ functor::Add2Functor<Device, T> functor2;
+ functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
+ break;
+ }
+ case 3: {
+ functor::Add3Functor<Device, T> functor3;
+ functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2));
+ break;
+ }
+ case 4: {
+ functor::Add4Functor<Device, T> functor4;
+ functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3));
+ break;
+ }
+ case 5: {
+ functor::Add5Functor<Device, T> functor5;
+ functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4));
+ break;
+ }
+ case 6: {
+ functor::Add6Functor<Device, T> functor6;
+ functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5));
+ break;
+ }
+ case 7: {
+ functor::Add7Functor<Device, T> functor7;
+ functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6));
+ break;
+ }
+ case 0: {
+ functor::Add8Functor<Device, T> functor8;
+ functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6), I(7));
+ r = 8;
+ break;
+ }
+ case 1: {
+ functor::Add9Functor<Device, T> functor9;
+ functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6), I(7), I(8));
+ r = 9;
+ break;
+ }
+ }
+
+ for (; r < num; r += kWidth) {
+ functor::Add8pFunctor<Device, T> functor8p;
+ functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1),
+ I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7));
+ }
+#endif // defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID)
+
+#undef I
+ }
+};
+
+// Partial specializations for a CPUDevice, that uses the Eigen implementation
+// from AddNEigenImpl.
+namespace functor {
+template <typename T>
+struct Add2Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ Add2EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2);
+ }
+};
+template <typename T>
+struct Add3Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ Add3EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3);
+ }
+};
+template <typename T>
+struct Add4Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ Add4EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4);
+ }
+};
+template <typename T>
+struct Add5Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ Add5EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5);
+ }
+};
+template <typename T>
+struct Add6Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ Add6EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6);
+ }
+};
+template <typename T>
+struct Add7Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ Add7EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7);
+ }
+};
+
+template <typename T>
+struct Add8Functor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add8pFunctor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8pEigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add9Functor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ Add9EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8, in9);
+ }
+};
+
+} // namespace functor
+
+#define REGISTER_ADDN(type, dev) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
+ AddNOp<dev##Device, type>)
+
+#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)
+
+TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
+#undef REGISTER_ADDN_CPU
+
+#if GOOGLE_CUDA
+REGISTER_ADDN(float, GPU);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_ADDN
+
+} // namespace tensorflow
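
AddNOp::Compute above sums its inputs in chunks of eight: after the single-input early-out, the switch on num % kWidth writes a first partial sum of between two and nine inputs into the output, and the trailing loop then folds in the remaining inputs eight at a time with Add8pFunctor, which accumulates with += instead of assigning. The helper below is only an illustration of that dispatch (a hypothetical function, not part of the patch); for an input count of at least two it returns the size of the initial assigning chunk and the number of Add8p passes.

#include <utility>

// Illustrative: mirrors the chunked dispatch in AddNOp::Compute.
std::pair<int, int> AddNDispatchPlan(int num) {
  const int kWidth = 8;
  int first = num % kWidth;   // switch arm that assigns to the output
  if (first == 0) first = 8;  // case 0 uses Add8Functor
  if (first == 1) first = 9;  // case 1 uses Add9Functor
  int passes = 0;             // each pass is one Add8pFunctor (+=) call
  for (int r = first; r < num; r += kWidth) ++passes;
  return {first, passes};     // e.g. num == 11 -> {3, 1}; num == 17 -> {9, 1}
}
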
diff --git a/tensorflow/core/kernels/aggregate_ops.h b/tensorflow/core/kernels/aggregate_ops.h
new file mode 100644
index 0000000000..2214901970
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops.h
@@ -0,0 +1,211 @@
+#ifndef TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+#define TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+
+// Functor definitions for Aggregate ops, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct Add2Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2);
+};
+
+template <typename Device, typename T>
+struct Add2EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ out.device(d) = in1 + in2;
+ }
+};
+
+template <typename Device, typename T>
+struct Add3Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3);
+};
+
+template <typename Device, typename T>
+struct Add3EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ out.device(d) = in1 + in2 + in3;
+ }
+};
+
+template <typename Device, typename T>
+struct Add4Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4);
+};
+
+template <typename Device, typename T>
+struct Add4EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ out.device(d) = in1 + in2 + in3 + in4;
+ }
+};
+
+template <typename Device, typename T>
+struct Add5Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5);
+};
+
+template <typename Device, typename T>
+struct Add5EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5;
+ }
+};
+
+template <typename Device, typename T>
+struct Add6Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6);
+};
+
+template <typename Device, typename T>
+struct Add6EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6;
+ }
+};
+
+template <typename Device, typename T>
+struct Add7Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7);
+};
+
+template <typename Device, typename T>
+struct Add7EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7;
+ }
+};
+
+template <typename Device, typename T>
+struct Add8Functor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8);
+};
+
+template <typename Device, typename T>
+struct Add8EigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8;
+ }
+};
+
+// Add8p is like Add8 except the underlying implementation should +=
+// rather than assign to the output.
+template <typename Device, typename T>
+struct Add8pFunctor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8);
+};
+
+template <typename Device, typename T>
+struct Add8pEigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ out.device(d) += in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8;
+ }
+};
+
+template <typename Device, typename T>
+struct Add9Functor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9);
+};
+
+template <typename Device, typename T>
+struct Add9EigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9;
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
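
This header only declares the AddN functor family and supplies device-agnostic AddNEigenImpl bodies; aggregate_ops.cc and aggregate_ops_gpu.cu.cc then provide the CPU and GPU partial specializations that forward to those bodies, so the header stays compilable by nvcc while each device builds only its own kernels. Stripped down to a single arity with made-up names, the pattern is roughly the sketch below; it is not part of the patch.

// Pattern sketch only: generic declaration, shared body, per-device specialization.
template <typename Device, typename T>
struct AddTwo {              // declared generically, defined per device
  void operator()(const Device& d, T* out, const T* a, const T* b, int n);
};

template <typename Device, typename T>
struct AddTwoImpl {          // shared body usable from any translation unit
  static void Compute(const Device& d, T* out, const T* a, const T* b, int n) {
    for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];  // stand-in for Eigen
  }
};

struct CPUDeviceTag {};      // stand-in for Eigen::ThreadPoolDevice

template <typename T>
struct AddTwo<CPUDeviceTag, T> {  // specialization compiled in the CPU .cc
  void operator()(const CPUDeviceTag& d, T* out, const T* a, const T* b,
                  int n) {
    AddTwoImpl<CPUDeviceTag, T>::Compute(d, out, a, b, n);
  }
};
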
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
new file mode 100644
index 0000000000..5cf2934ac1
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
@@ -0,0 +1,141 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/aggregate_ops.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation.
+namespace functor {
+template <typename T>
+struct Add2Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ Add2EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2);
+ }
+};
+
+template <typename T>
+struct Add3Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ Add3EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3);
+ }
+};
+
+template <typename T>
+struct Add4Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ Add4EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4);
+ }
+};
+
+template <typename T>
+struct Add5Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ Add5EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5);
+ }
+};
+
+template <typename T>
+struct Add6Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ Add6EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6);
+ }
+};
+
+template <typename T>
+struct Add7Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ Add7EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7);
+ }
+};
+
+template <typename T>
+struct Add8Functor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add8pFunctor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8pEigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add9Functor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ Add9EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8, in9);
+ }
+};
+
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::Add2Functor<GPUDevice, float>;
+template struct functor::Add3Functor<GPUDevice, float>;
+template struct functor::Add4Functor<GPUDevice, float>;
+template struct functor::Add5Functor<GPUDevice, float>;
+template struct functor::Add6Functor<GPUDevice, float>;
+template struct functor::Add7Functor<GPUDevice, float>;
+template struct functor::Add8Functor<GPUDevice, float>;
+template struct functor::Add8pFunctor<GPUDevice, float>;
+template struct functor::Add9Functor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
new file mode 100644
index 0000000000..0845eebf09
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -0,0 +1,163 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/argmax_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, typename ArgFunctor>
+class ArgOp : public OpKernel {
+ public:
+ explicit ArgOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& dimension = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(dimension.shape()),
+ errors::InvalidArgument(
+ "dim must be a scalar, but received tensor of shape: ",
+ dimension.shape().DebugString()));
+
+ const int32 dim = dimension.scalar<int32>()();
+ const int input_dims = input.dims();
+
+ OP_REQUIRES(context, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
+ OP_REQUIRES(context, dim < input_dims,
+                errors::InvalidArgument("Minimum tensor rank: ", dim + 1,
+ " but got: ", input_dims));
+
+ TensorShape output_shape;
+ TensorShape input_shape = input.shape();
+ for (int d = 0; d < input_dims - 1; ++d) {
+ output_shape.AddDim(input_shape.dim_size((d < dim) ? d : d + 1));
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(), \
+ input.tensor<T, NDIM>(), dim, \
+ output->tensor<int64, NDIM - 1>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "ArgOp : Unhandled input dimensions: ", input_dims));
+ }
+ }
+#undef HANDLE_DIM
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
+};
+
+template <typename Device, typename T>
+class ArgMaxOp : public ArgOp<Device, T, functor::ArgMax<Device, T> > {
+ public:
+ explicit ArgMaxOp(OpKernelConstruction* context)
+ : ArgOp<Device, T, functor::ArgMax<Device, T> >(context) {}
+};
+
+template <typename Device, typename T>
+class ArgMinOp : public ArgOp<Device, T, functor::ArgMin<Device, T> > {
+ public:
+ explicit ArgMinOp(OpKernelConstruction* context)
+ : ArgOp<Device, T, functor::ArgMin<Device, T> >(context) {}
+};
+
+#define REGISTER_ARGMAX(type) \
+ REGISTER_KERNEL_BUILDER(Name("ArgMax") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMaxOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("ArgMin") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMinOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void ArgMax<GPUDevice, T>::Reduce##Dims( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output); \
+ template <> \
+ void ArgMin<GPUDevice, T>::Reduce##Dims( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output);
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+#define DECLARE_GPU_CLASS(T) \
+ extern template struct ArgMax<GPUDevice, T>; \
+ extern template struct ArgMin<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS);
+
+#undef DECLARE_GPU_SPECS
+#undef DECLARE_GPU_CLASS
+
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_ARGMAX_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("ArgMax") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMaxOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("ArgMin") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMinOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_ARGMAX_GPU);
+
+#undef REGISTER_ARGMAX_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
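A minimal standalone sketch (illustrative only, not part of the kernel) of the output-shape rule used in ArgOp::Compute above: the reduced dimension `dim` is dropped and the remaining dimensions keep their order.

#include <cstdio>
#include <vector>

int main() {
  // A [2, 3, 5] input reduced along dim = 1 yields a [2, 5] output of indices.
  const std::vector<int> input_shape = {2, 3, 5};
  const int dim = 1;
  std::vector<int> output_shape;
  for (int d = 0; d < static_cast<int>(input_shape.size()) - 1; ++d) {
    output_shape.push_back(input_shape[(d < dim) ? d : d + 1]);
  }
  for (int d : output_shape) std::printf("%d ", d);  // prints: 2 5
  std::printf("\n");
  return 0;
}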
diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h
new file mode 100644
index 0000000000..41734f3254
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op.h
@@ -0,0 +1,55 @@
+#ifndef TENSORFLOW_KERNELS_ARGMAX_OP_H_
+#define TENSORFLOW_KERNELS_ARGMAX_OP_H_
+// Functor definitions for ArgMaxOp and ArgMinOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct ArgMax {
+#define DECLARE_COMPUTE_SPEC(Dims) \
+ EIGEN_ALWAYS_INLINE static void Reduce##Dims( \
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, \
+ typename TTypes<int64, Dims - 1>::Tensor output) { \
+ output.device(d) = input.argmax(dimension).template cast<int64>(); \
+ }
+
+ DECLARE_COMPUTE_SPEC(1);
+ DECLARE_COMPUTE_SPEC(2);
+ DECLARE_COMPUTE_SPEC(3);
+ DECLARE_COMPUTE_SPEC(4);
+ DECLARE_COMPUTE_SPEC(5);
+
+#undef DECLARE_COMPUTE_SPEC
+};
+
+template <typename Device, typename T>
+struct ArgMin {
+#define DECLARE_COMPUTE_SPEC(Dims) \
+ EIGEN_ALWAYS_INLINE static void Reduce##Dims( \
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, \
+ typename TTypes<int64, Dims - 1>::Tensor output) { \
+ output.device(d) = input.argmin(dimension).template cast<int64>(); \
+ }
+
+ DECLARE_COMPUTE_SPEC(1);
+ DECLARE_COMPUTE_SPEC(2);
+ DECLARE_COMPUTE_SPEC(3);
+ DECLARE_COMPUTE_SPEC(4);
+ DECLARE_COMPUTE_SPEC(5);
+
+#undef DECLARE_COMPUTE_SPEC
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ARGMAX_OP_H_
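For reference, a standalone sketch of what the Reduce2 expression above evaluates to, assuming Eigen's unsupported Tensor module is on the include path (plain unsupported/Eigen/... rather than the third_party prefix used in this tree):

#include <cstdint>
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> input(2, 3);
  input.setValues({{1.f, 5.f, 2.f}, {7.f, 0.f, 3.f}});
  // Same expression as Reduce2: argmax along dimension 1, cast to int64.
  Eigen::Tensor<int64_t, 1> indices = input.argmax(1).cast<int64_t>();
  std::cout << indices(0) << " " << indices(1) << std::endl;  // prints: 1 0
  return 0;
}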
diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
new file mode 100644
index 0000000000..6c91fc2c86
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -0,0 +1,20 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/argmax_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_SPEC(T) \
+ template struct functor::ArgMax<GPUDevice, T>; \
+ template struct functor::ArgMin<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
new file mode 100644
index 0000000000..3306f1eeaa
--- /dev/null
+++ b/tensorflow/core/kernels/assign_op.h
@@ -0,0 +1,92 @@
+#ifndef TENSORFLOW_KERNELS_ASSIGN_OP_H_
+#define TENSORFLOW_KERNELS_ASSIGN_OP_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// TODO(jeff): Get rid of use_exclusive_lock_ option
+
+// Computes *input[0] = input[1]
+class AssignOp : public OpKernel {
+ public:
+ explicit AssignOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("use_locking", &use_exclusive_lock_));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("validate_shape", &validate_shape_));
+ OP_REQUIRES(context, IsRefType(context->input_type(0)),
+ errors::InvalidArgument("lhs input needs to be a ref type"));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ Tensor rhs = context->input(1);
+
+ // We always return the input ref.
+ context->forward_ref_input_to_ref_output(0, 0);
+
+    // If the left-hand side is not initialized, or the shape of the
+    // right-hand side differs from that of the left-hand side, we need
+    // to allocate a new tensor.
+ {
+ mutex_lock l(*context->input_ref_mutex(0));
+
+ Tensor old_lhs = context->mutable_input(0, true);
+
+ if (validate_shape_) {
+ OP_REQUIRES(
+ context, old_lhs.shape().IsSameSize(rhs.shape()),
+ errors::InvalidArgument(
+ "Assign requires shapes of both tensors to match. lhs shape= ",
+ old_lhs.shape().ShortDebugString(), " rhs shape= ",
+ rhs.shape().ShortDebugString()));
+ }
+
+ const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
+ if (!old_lhs.IsInitialized() || !same_shape) {
+        // Create a new tensor whose shape matches the right-hand side,
+        // copy the data into it, then hand it off to the lhs.
+ // We can't always know how this value will be used downstream,
+ // so make conservative assumptions in specifying the memory
+ // allocation attributes.
+ AllocatorAttributes attr;
+ attr.set_gpu_compatible(true);
+ PersistentTensor copy;
+ Tensor* copyTensor = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
+ &copy, &copyTensor, attr));
+ Copy(context, copyTensor, rhs);
+ context->replace_ref_input(0, *copyTensor, true);
+ return;
+ }
+
+ // The tensor has already been initialized and the right hand side
+ // matches the left hand side's shape.
+ if (use_exclusive_lock_) {
+ Copy(context, &old_lhs, rhs);
+ return;
+ }
+ }
+
+ // The tensor has already been initialized and the right hand side
+ // matches the left hand side's shape. We have been told to do the
+ // copy outside the lock.
+ Tensor old_unlocked_lhs = context->mutable_input(0, false);
+ Copy(context, &old_unlocked_lhs, rhs);
+ }
+
+ virtual void Copy(OpKernelContext* context, Tensor* lhs,
+ const Tensor& rhs) = 0;
+
+ bool use_exclusive_lock_;
+ bool validate_shape_;
+};
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ASSIGN_OP_H_
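A minimal sketch of how a concrete kernel can plug into this base class; the class name below is hypothetical and float-only, not the actual TensorFlow dense-update kernels:

// Hypothetical subclass: only Copy() has to be provided; AssignOp handles
// locking, shape validation and (re)allocation of the lhs buffer.
class AssignCpuFloatOp : public AssignOp {
 public:
  explicit AssignCpuFloatOp(OpKernelConstruction* context)
      : AssignOp(context) {}

  void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override {
    lhs->flat<float>().device(context->eigen_cpu_device()) = rhs.flat<float>();
  }
};

// Registration would look roughly like (illustrative only):
// REGISTER_KERNEL_BUILDER(
//     Name("Assign").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//     AssignCpuFloatOp);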
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
new file mode 100644
index 0000000000..28763f65a4
--- /dev/null
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -0,0 +1,92 @@
+// See docs in ../ops/attention_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+
+class ExtractGlimpseOp : public OpKernel {
+ public:
+ explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_));
+ OP_REQUIRES_OK(context, context->GetAttr("centered", &centered_));
+ OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise_));
+ }
+
+ // Expect input tensor of rank 4 with dimensions (batch_size, height, width,
+ // depth).
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const TensorShape input_shape = input.shape();
+ const int32 num_dims = input_shape.dims();
+ OP_REQUIRES(
+ context, num_dims == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional (batch_size, height, width, depth)",
+ input_shape.ShortDebugString()));
+
+ const int64 batch_size = input_shape.dim_size(0);
+
+ const Tensor& window_size = context->input(1);
+    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
+                            window_size.shape().dim_size(0) == 2,
+                errors::InvalidArgument(
+                    "window_size must be a vector of size 2 (height, width)",
+                    window_size.shape().ShortDebugString()));
+
+ const int64 output_height = window_size.tensor<int, 1>()(0);
+ const int64 output_width = window_size.tensor<int, 1>()(1);
+ TensorShape output_shape = input_shape;
+ output_shape.set_dim(1, output_height);
+ output_shape.set_dim(2, output_width);
+
+ const Tensor& offsets = context->input(2);
+    OP_REQUIRES(context, offsets.shape().dims() == 2,
+                errors::InvalidArgument("offsets must be a matrix",
+                                        offsets.shape().ShortDebugString()));
+    OP_REQUIRES(context, offsets.shape().dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "first dimension of offsets must equal the batch size",
+                    offsets.shape().ShortDebugString()));
+ OP_REQUIRES(
+ context, offsets.shape().dim_size(1) == 2,
+ errors::InvalidArgument("second dimension should be of size 2 (y,x)",
+ offsets.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ std::vector<Eigen::IndexPair<float> > offset_vec;
+ offset_vec.reserve(batch_size);
+ for (int i = 0; i < batch_size; ++i) {
+ float offset_y = offsets.tensor<float, 2>()(i, 0);
+ float offset_x = offsets.tensor<float, 2>()(i, 1);
+      // Eigen::ExtractGlimpses expects offsets as (x,y), whereas the
+      // TensorFlow op receives them as (y,x) indices, so swap them here.
+ offset_vec.push_back(Eigen::IndexPair<float>(offset_x, offset_y));
+ }
+
+ output->tensor<float, 4>().swap_layout().device(
+ context->eigen_cpu_device()) =
+ Eigen::ExtractGlimpses(input.tensor<float, 4>().swap_layout(),
+ output_width, output_height, offset_vec,
+ normalized_, centered_, uniform_noise_);
+ }
+
+ private:
+ bool normalized_;
+ bool centered_;
+ bool uniform_noise_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU),
+ ExtractGlimpseOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
new file mode 100644
index 0000000000..26f98ffbcd
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -0,0 +1,418 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/avgpooling_op.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AvgPoolingOp : public UnaryOp<T> {
+ public:
+ explicit AvgPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+ OP_REQUIRES(context, params.depth_window == 1,
+ errors::Unimplemented(
+ "Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ // For avgpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, params.forward_output_shape(), &output));
+
+ if (std::is_same<Device, GPUDevice>::value) {
+ Eigen::PaddingType pt = BrainPadding2EigenPadding(padding_);
+ functor::SpatialAvgPooling<Device, T>()(
+ context->eigen_device<Device>(), output->tensor<T, 4>(),
+ tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, pt);
+ } else {
+ SpatialAvgPool<Device, T>(context, output, tensor_in, params, padding_);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPool")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ AvgPoolingOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialAvgPooling<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, int window_rows, \
+ int window_cols, int row_stride, int col_stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialAvgPooling<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("AvgPool")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ AvgPoolingOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+// The operation to compute AvgPool gradients.
+// It takes two inputs:
+// - The original input tensor shape
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <typename Device, class T>
+class AvgPoolingGradOp : public OpKernel {
+ public:
+ explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
+                             tensor_in_shape.NumElements() == 4,
+                errors::InvalidArgument(
+                    "orig_input_shape must be 1-dimensional with 4 "
+                    "elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+ const int64 out_backprop_batch = out_backprop.dim_size(0);
+ const int64 out_backprop_rows = out_backprop.dim_size(1);
+ const int64 out_backprop_cols = out_backprop.dim_size(2);
+ const int64 out_backprop_depth = out_backprop.dim_size(3);
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+ const int64 in_rows = output_shape.dim_size(1);
+ const int64 in_cols = output_shape.dim_size(2);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ output->flat<T>().setZero();
+
+ const int window_rows = ksize_[1];
+ const int window_cols = ksize_[2];
+ const int depth_window = ksize_[3];
+
+ const int row_stride = stride_[1];
+ const int col_stride = stride_[2];
+
+ // We (will) use different code for spatial pooling and
+ // non-spatial pooling.
+ //
+ // Spatial pooling is when depth_window = 1
+ OP_REQUIRES(context, depth_window == 1,
+ errors::Unimplemented(
+ "Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ int out_height, out_width, pad_rows, pad_cols;
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols,
+ row_stride, col_stride, padding_, &out_height,
+ &out_width, &pad_rows, &pad_cols));
+
+ const T* out_backprop_ptr = out_backprop.flat<T>().data();
+ T* input_backprop_ptr = output->flat<T>().data();
+
+ for (int64 b = 0; b < out_backprop_batch; ++b) {
+ for (int64 r = 0; r < out_backprop_rows; ++r) {
+ // Calculates row broadcast size. For SAME padding, current
+ // index could be in the padding area, and r*row_stride +
+ // window_rows could be beyond the input tensor's boundary. In
+ // such cases, change the starting index and reduce the
+ // broadcast size.
+ int rindex, rsize;
+ OP_REQUIRES_OK(context,
+ GetBroadcastSize(r, in_rows, window_rows, row_stride,
+ pad_rows, &rindex, &rsize));
+ for (int64 c = 0; c < out_backprop_cols; ++c) {
+ // Calculates col broadcast size. For SAME padding, current
+ // index could be in the padding area, and c*col_stride +
+ // window_cols could be beyond the input tensor's boundary. In
+ // such cases, change the starting index and reduce the
+ // broadcast size.
+ int cindex, csize;
+ OP_REQUIRES_OK(context,
+ GetBroadcastSize(c, in_cols, window_cols, col_stride,
+ pad_cols, &cindex, &csize));
+
+ T divide_coeff = 1.0 / (rsize * csize);
+ int64 output_index =
+ (b * out_backprop_rows + r) * out_backprop_cols + c;
+ for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
+ for (int64 c_dst = cindex; c_dst < cindex + csize; ++c_dst) {
+ int64 input_index = (b * in_rows + r_dst) * in_cols + c_dst;
+ const T* output_offset =
+ out_backprop_ptr + output_index * out_backprop_depth;
+ T* input_offset =
+ input_backprop_ptr + input_index * out_backprop_depth;
+ for (int64 d = 0; d < out_backprop_depth; ++d) {
+ *input_offset += *output_offset * divide_coeff;
+ ++output_offset;
+ ++input_offset;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+
+// A cuDNN-based AvgPoolingGrad implementation. Padded cells are included as
+// candidates in the pooling windows.
+template <class T>
+class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(
+        context,
+        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+        errors::InvalidArgument(
+            "orig_input_shape must be 1-dimensional with 4 elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+
+ DnnPoolingGradOp<T>::Compute(
+ context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
+ stride_, padding_, nullptr, nullptr, out_backprop, output_shape);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape")
+ .Label("cudnn"),
+ AvgPoolingGradOp<GPUDevice, float>);
+
+// A custom GPU-kernel-based AvgPoolingGrad implementation. Padded cells are
+// included as candidates in the pooling windows.
+template <class T>
+class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit AvgPoolingGradOpCustomGPUKernel(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(
+        context,
+        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+        errors::InvalidArgument(
+            "orig_input_shape must be 1-dimensional with 4 elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+ const int64 out_backprop_batch = out_backprop.dim_size(0);
+ const int64 out_backprop_rows = out_backprop.dim_size(1);
+ const int64 out_backprop_cols = out_backprop.dim_size(2);
+ const int64 out_backprop_depth = out_backprop.dim_size(3);
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+ const int64 in_rows = output_shape.dim_size(1);
+ const int64 in_cols = output_shape.dim_size(2);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ const int window_rows = ksize_[1];
+ const int window_cols = ksize_[2];
+ const int depth_window = ksize_[3];
+
+ const int row_stride = stride_[1];
+ const int col_stride = stride_[2];
+
+ // We (will) use different code for spatial pooling and
+ // non-spatial pooling.
+ //
+ // Spatial pooling is when depth_window = 1
+ OP_REQUIRES(context, depth_window == 1,
+ errors::Unimplemented("Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ int out_height, out_width, pad_rows, pad_cols;
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols,
+ row_stride, col_stride, padding_, &out_height,
+ &out_width, &pad_rows, &pad_cols));
+
+ RunAvePoolBackwardNHWC<T>(out_backprop.flat<T>().data(), // top_diff
+ out_backprop_batch, // num
+ in_rows, // height
+ in_cols, // width
+ out_backprop_depth, // channels
+ out_backprop_rows, // pooled_height
+ out_backprop_cols, // pooled_width
+ window_rows, // kernel_h
+ window_cols, // kernel_w
+ row_stride, // stride_h
+ col_stride, // stride_w
+ pad_rows, // pad_t
+ pad_cols, // pad_l
+ output->flat<T>().data(), // bottom_diff
+ context->eigen_gpu_device()); // d
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOpCustomGPUKernel<float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
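A self-contained numeric sketch of the gradient rule implemented by the CPU AvgPoolingGradOp above (one channel, 4x4 input, 2x2 window, stride 2, VALID padding): every output gradient is spread uniformly over its pooling window, i.e. scaled by the same divide_coeff as in the kernel.

#include <cstdio>

int main() {
  // 4x4 input, 2x2 window, stride 2, VALID padding -> 2x2 output.
  const int in_size = 4, out_size = 2, ksize = 2, stride = 2;
  const float out_backprop[2][2] = {{1.f, 2.f}, {3.f, 4.f}};
  float in_backprop[4][4] = {};
  for (int r = 0; r < out_size; ++r) {
    for (int c = 0; c < out_size; ++c) {
      // Same role as divide_coeff above: split the gradient evenly over the
      // ksize x ksize window that produced this output cell.
      const float coeff = out_backprop[r][c] / (ksize * ksize);
      for (int dr = 0; dr < ksize; ++dr)
        for (int dc = 0; dc < ksize; ++dc)
          in_backprop[r * stride + dr][c * stride + dc] += coeff;
    }
  }
  for (int r = 0; r < in_size; ++r) {
    for (int c = 0; c < in_size; ++c) std::printf("%.2f ", in_backprop[r][c]);
    std::printf("\n");
  }
  return 0;
}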
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
new file mode 100644
index 0000000000..38f0eb97e5
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -0,0 +1,58 @@
+#ifndef TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+#define TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+// Functor definition for AvgPoolingOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct SpatialAvgPooling {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input, int window_rows,
+ int window_cols, int row_stride, int col_stride,
+ const Eigen::PaddingType& padding) {
+ // Because we swap the layout, we swap the row/cols as well
+ output.swap_layout().device(d) =
+ Eigen::SpatialAvgPooling(input.swap_layout(), window_cols, window_rows,
+ col_stride, row_stride, padding);
+ }
+};
+
+} // namespace functor
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Launch a custom GPU kernel from Yanqing for the avgpooling backward
+// operation that works on the NHWC data format.
+// Arguments:
+// top_diff: backprop to the output of the pooling layer
+// num: number of input batches
+// height: input height
+// width: input width
+// channels: number of input channels
+// pooled_height: the height of the output to the pooling layer
+// pooled_width: the width of the output to the pooling layer
+// kernel_h: the height of the pooling kernel
+// kernel_w: the width of the pooling kernel
+//    stride_h: the vertical stride
+//    stride_w: the horizontal stride
+// pad_t: padding size to the top side
+// pad_l: padding size to the left side
+// bottom_diff: backprop to the input of the pooling layer.
+template <typename T>
+bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, T* const bottom_diff,
+ const GPUDevice& d);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
new file mode 100644
index 0000000000..ec84ee6862
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -0,0 +1,101 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+#include <iostream>
+
+#include "tensorflow/core/kernels/avgpooling_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::SpatialAvgPooling<GPUDevice, T>;
+
+DEFINE_GPU_KERNELS(float)
+
+#undef DEFINE_GPU_KERNELS
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+static const int CAFFE_CUDA_NUM_THREADS = 1024;
+
+template <typename dtype>
+__global__ void AvePoolBackwardNHWC(const int nthreads,
+ const dtype* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* const bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // Recover the (n, h, w, c) coordinates, in the padded input frame,
+    // from the flat NHWC index.
+ const int c = index % channels;
+ const int w = index / channels % width + pad_l;
+ const int h = (index / channels / width) % height + pad_t;
+ const int n = index / channels / width / height;
+ const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+ const int phend = min(h / stride_h + 1, pooled_height);
+ const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+ const int pwend = min(w / stride_w + 1, pooled_width);
+ dtype gradient = 0;
+ const dtype* const top_diff_slice =
+ top_diff + n * pooled_height * pooled_width * channels + c;
+ for (int ph = phstart; ph < phend; ++ph) {
+ for (int pw = pwstart; pw < pwend; ++pw) {
+ // figure out the pooling size
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ gradient +=
+ top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
+ }
+ }
+ bottom_diff[index] = gradient;
+ }
+}
+
+template <typename T>
+bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, T* const bottom_diff,
+ const GPUDevice& d) {
+ int x_size = num * height * width * channels;
+ int thread_per_block =
+ std::min(CAFFE_CUDA_NUM_THREADS, d.maxCudaThreadsPerMultiProcessor());
+ int block_count = (x_size + thread_per_block - 1) / thread_per_block;
+ AvePoolBackwardNHWC<T><<<block_count, thread_per_block, 0, d.stream()>>>(
+ x_size, top_diff, num, height, width, channels, pooled_height,
+      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ bottom_diff);
+
+ return d.ok();
+}
+
+template bool RunAvePoolBackwardNHWC(
+ const float* const top_diff, const int num, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* const bottom_diff, const GPUDevice& d);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
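CUDA_1D_KERNEL_LOOP above is a grid-stride loop. A minimal standalone sketch of the same pattern (hypothetical kernel, not part of this patch):

// Each thread starts at its global index and steps by the total number of
// threads in the grid, so any problem size n is covered regardless of the
// launch configuration.
__global__ void ScaleKernel(const float* in, float* out, float alpha, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = alpha * in[i];
  }
}

// Launch example: ScaleKernel<<<(n + 255) / 256, 256>>>(in, out, 2.0f, n);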
diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc
new file mode 100644
index 0000000000..349aac0158
--- /dev/null
+++ b/tensorflow/core/kernels/batch_matmul_op.cc
@@ -0,0 +1,260 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename Scalar>
+struct LaunchBatchMatMul;
+
+template <typename Scalar>
+struct LaunchBatchMatMul<CPUDevice, Scalar> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ auto Tx = in_x.tensor<Scalar, 3>();
+ auto Ty = in_y.tensor<Scalar, 3>();
+ auto Tz = out->tensor<Scalar, 3>();
+
+ // Shards "n"-matmuls into "num" shards. Each shard is
+ // dispatched to a thread.
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ const int64 num_units = in_x.dim_size(0);
+ const int64 cost_per_unit =
+ in_x.dim_size(0) * in_x.dim_size(1) * out->dim_size(2);
+ Shard(worker_threads.num_threads, worker_threads.workers, num_units,
+ cost_per_unit, [&Tx, &Ty, adj_x, adj_y, &Tz](int start, int limit) {
+ LaunchBatchMatMul<CPUDevice, Scalar>::Run(Tx, Ty, adj_x, adj_y, Tz,
+ start, limit);
+ });
+ }
+
+ template <typename In, typename Out>
+ static void Run(In Tx, In Ty, bool adj_x, bool adj_y, Out Tz, int start,
+ int limit) {
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
+
+ Eigen::internal::scalar_conjugate_op<Scalar> conj;
+ if (!adj_x && !adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i);
+ auto y = Ty.template chip<0>(i);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else if (!adj_x && adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i);
+ auto y = Ty.template chip<0>(i).unaryExpr(conj);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else if (adj_x && !adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i).unaryExpr(conj);
+ auto y = Ty.template chip<0>(i);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i).unaryExpr(conj);
+ auto y = Ty.template chip<0>(i).unaryExpr(conj);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ }
+ }
+};
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+template <typename Scalar>
+struct LaunchBatchMatMul<GPUDevice, Scalar> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ perftools::gputools::blas::Transpose trans[] = {
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose};
+ const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
+ const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
+ const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
+ const uint64 batch_size = in_x.dim_size(0);
+ auto blas_transpose_a = trans[adj_x];
+ auto blas_transpose_b = trans[adj_y];
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+ std::vector<DeviceMemoryType> a_device_memory;
+ std::vector<DeviceMemoryType> b_device_memory;
+ std::vector<DeviceMemoryType> c_device_memory;
+ std::vector<DeviceMemoryType*> a_ptrs;
+ std::vector<DeviceMemoryType*> b_ptrs;
+ std::vector<DeviceMemoryType*> c_ptrs;
+ a_device_memory.reserve(batch_size);
+ b_device_memory.reserve(batch_size);
+ c_device_memory.reserve(batch_size);
+ a_ptrs.reserve(batch_size);
+ b_ptrs.reserve(batch_size);
+ c_ptrs.reserve(batch_size);
+ auto* a_base_ptr = in_x.template flat<Scalar>().data();
+ auto* b_base_ptr = in_y.template flat<Scalar>().data();
+ auto* c_base_ptr = out->template flat<Scalar>().data();
+ for (int64 i = 0; i < batch_size; ++i) {
+ a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
+ b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
+ c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
+ a_ptrs.push_back(&a_device_memory.back());
+ b_ptrs.push_back(&b_device_memory.back());
+ c_ptrs.push_back(&c_device_memory.back());
+ }
+
+    // cuBLAS computes
+    //   C = A x B
+    // with A, B and C in column-major order. Our tensors are row-major, so
+    // we instead compute
+    //   C' = B' x A'   (' denotes transpose)
+    // which leaves C laid out correctly in row-major memory.
+ bool blas_launch_status =
+ stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k,
+ static_cast<Scalar>(1.0), b_ptrs,
+ adj_y ? k : n, a_ptrs, adj_x ? m : k,
+ static_cast<Scalar>(0.0), c_ptrs, n,
+ batch_size)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal(
+ "Blas SGEMMBatched launch failed : a.shape=",
+ in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(),
+ ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size));
+ }
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename Scalar>
+class BatchMatMul : public OpKernel {
+ public:
+ explicit BatchMatMul(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
+ OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
+ }
+
+ virtual ~BatchMatMul() {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+    OP_REQUIRES(ctx, in0.dims() == in1.dims(),
+                errors::InvalidArgument(
+                    "In[0] and In[1] have different ndims: ",
+                    in0.shape().ShortDebugString(), " vs. ",
+                    in1.shape().ShortDebugString()));
+ const int ndims = in0.dims();
+ OP_REQUIRES(
+ ctx, ndims >= 3,
+ errors::InvalidArgument("In[0] and In[1] ndims must be >= 3: ", ndims));
+ TensorShape out_shape;
+ for (int i = 0; i < ndims - 2; ++i) {
+ OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
+ errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
+ i, ") must be the same: ",
+ in0.shape().DebugString(), " vs ",
+ in1.shape().DebugString()));
+ out_shape.AddDim(in0.dim_size(i));
+ }
+ auto n = out_shape.num_elements();
+ auto d0 = in0.dim_size(ndims - 2);
+ auto d1 = in0.dim_size(ndims - 1);
+ Tensor in0_reshaped;
+ CHECK(in0_reshaped.CopyFrom(in0, TensorShape({n, d0, d1})));
+ auto d2 = in1.dim_size(ndims - 2);
+ auto d3 = in1.dim_size(ndims - 1);
+ Tensor in1_reshaped;
+ CHECK(in1_reshaped.CopyFrom(in1, TensorShape({n, d2, d3})));
+ if (adj_x_) std::swap(d0, d1);
+ if (adj_y_) std::swap(d2, d3);
+ OP_REQUIRES(ctx, d1 == d2,
+ errors::InvalidArgument(
+ "In[0] mismatch In[1] shape: ", d1, " vs. ", d2, ": ",
+ in0.shape().ShortDebugString(), " ",
+ in1.shape().ShortDebugString(), " ", adj_x_, " ", adj_y_));
+ out_shape.AddDim(d0);
+ out_shape.AddDim(d3);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+ if (out->NumElements() == 0) {
+ return;
+ }
+ if (in0.NumElements() == 0 || in1.NumElements() == 0) {
+ functor::SetZeroFunctor<Device, Scalar> f;
+ f(ctx->eigen_device<Device>(), out->flat<Scalar>());
+ return;
+ }
+ Tensor out_reshaped;
+ CHECK(out_reshaped.CopyFrom(*out, TensorShape({n, d0, d3})));
+ LaunchBatchMatMul<Device, Scalar>::Launch(ctx, in0_reshaped, in1_reshaped,
+ adj_x_, adj_y_, &out_reshaped);
+ }
+
+ private:
+ bool adj_x_;
+ bool adj_y_;
+};
+
+#define REGISTER_CPU(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+ BatchMatMul<CPUDevice, TYPE>)
+
+#define REGISTER_GPU(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+ BatchMatMul<GPUDevice, TYPE>)
+
+REGISTER_CPU(float);
+REGISTER_CPU(double);
+REGISTER_CPU(int32);
+REGISTER_CPU(complex64);
+
+#ifdef GOOGLE_CUDA
+// TODO(kalakris): The GPU implementation is currently disabled due to issues
+// encountered in practice. See b/24534272.
+// REGISTER_GPU(float);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_CPU
+#undef REGISTER_GPU
+} // end namespace tensorflow
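A small standalone check of the layout argument used in LaunchBatchMatMul<GPUDevice, Scalar> above: a row-major C = A x B occupies exactly the same memory as a column-major C' = B' x A', so a column-major GEMM can be handed the row-major buffers with the operand order swapped.

#include <cstdio>

// Naive column-major matmul: a is m x k, b is k x n, c is m x n.
void MatMulColMajor(const float* a, const float* b, float* c,
                    int m, int n, int k) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += a[p * m + i] * b[j * k + p];
      c[j * m + i] = acc;
    }
}

int main() {
  const float a[] = {1, 2, 3, 4, 5, 6};     // row-major A, 2x3
  const float b[] = {7, 8, 9, 10, 11, 12};  // row-major B, 3x2
  float c[4];                               // will hold row-major C = A*B, 2x2
  // Feed the column-major routine B' and A' (the same buffers, order swapped).
  MatMulColMajor(b, a, c, /*m=*/2, /*n=*/2, /*k=*/3);
  std::printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]);  // 58 64 / 139 154
  return 0;
}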
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
new file mode 100644
index 0000000000..c67c921631
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -0,0 +1,223 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/batch_norm_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BatchNormOp : public OpKernel {
+ public:
+ explicit BatchNormOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("variance_epsilon", &variance_epsilon_));
+ OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
+ &scale_after_normalization_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& mean = context->input(1);
+ const Tensor& var = context->input(2);
+ const Tensor& beta = context->input(3);
+ const Tensor& gamma = context->input(4);
+
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, mean.dims() == 1,
+ errors::InvalidArgument("mean must be 1-dimensional",
+ mean.shape().ShortDebugString()));
+ OP_REQUIRES(context, var.dims() == 1,
+ errors::InvalidArgument("var must be 1-dimensional",
+ var.shape().ShortDebugString()));
+ OP_REQUIRES(context, beta.dims() == 1,
+ errors::InvalidArgument("beta must be 1-dimensional",
+ beta.shape().ShortDebugString()));
+ OP_REQUIRES(context, gamma.dims() == 1,
+ errors::InvalidArgument("gamma must be 1-dimensional",
+ gamma.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ functor::BatchNorm<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
+ var.vec<T>(), beta.vec<T>(), gamma.vec<T>(), variance_epsilon_,
+ scale_after_normalization_, output->tensor<T, 4>());
+ }
+
+ private:
+ float variance_epsilon_;
+ bool scale_after_normalization_;
+};
+
+template <typename Device, typename T>
+class BatchNormGradOp : public OpKernel {
+ public:
+ explicit BatchNormGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("variance_epsilon", &variance_epsilon_));
+ OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
+ &scale_after_normalization_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& mean = context->input(1);
+ const Tensor& var = context->input(2);
+ const Tensor& gamma = context->input(3);
+ const Tensor& out_backprop = context->input(4);
+
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, mean.dims() == 1,
+ errors::InvalidArgument("mean must be 1-dimensional",
+ mean.shape().ShortDebugString()));
+ OP_REQUIRES(context, var.dims() == 1,
+ errors::InvalidArgument("var must be 1-dimensional",
+ var.shape().ShortDebugString()));
+ OP_REQUIRES(context, gamma.dims() == 1,
+ errors::InvalidArgument("gamma must be 1-dimensional",
+ gamma.shape().ShortDebugString()));
+ OP_REQUIRES(
+ context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional",
+ out_backprop.shape().ShortDebugString()));
+
+ Tensor* dx = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx));
+ Tensor* dm = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm));
+ Tensor* dv = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv));
+ Tensor* db = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+ Tensor* dg = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
+
+ // Scratch buffer of [depth] dimension, aka the 4th dimension of input,
+ // which is dim_size(3), for calculating various combinations of
+ // (var + epsilon).
+ Tensor scratch1;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({input.dim_size(3)}), &scratch1));
+
+ // Scratch buffer of [depth] dimension for saving intermediate calculation
+ // values.
+ Tensor scratch2;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({input.dim_size(3)}), &scratch2));
+
+ functor::BatchNormGrad<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
+ var.vec<T>(), gamma.vec<T>(), out_backprop.tensor<T, 4>(),
+ variance_epsilon_, scale_after_normalization_, dx->tensor<T, 4>(),
+ dm->vec<T>(), dv->vec<T>(), db->vec<T>(), dg->vec<T>(),
+ scratch1.vec<T>(), scratch2.vec<T>());
+ }
+
+ private:
+ float variance_epsilon_;
+ bool scale_after_normalization_;
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void BatchNorm<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \
+ typename TTypes<T>::ConstVec beta, typename TTypes<T>::ConstVec gamma, \
+ float variance_epsilon, bool scale_after_normalization, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct BatchNorm<GPUDevice, T>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+
+DECLARE_GPU_SPECS(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormGradOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void BatchNormGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \
+ typename TTypes<T>::ConstVec gamma, \
+ typename TTypes<T, 4>::ConstTensor out_backprop, float variance_epsilon, \
+ bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, \
+ typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, \
+ typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, \
+ typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2); \
+ extern template struct BatchNormGrad<GPUDevice, T>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+
+DECLARE_GPU_SPECS(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormGradOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_norm_op.h b/tensorflow/core/kernels/batch_norm_op.h
new file mode 100644
index 0000000000..5981e58460
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op.h
@@ -0,0 +1,133 @@
+#ifndef TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+#define TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+// Functor definition for BatchNormOp, must be compilable by nvcc.
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by BatchNormOp to do the computations.
+template <typename Device, typename T>
+struct BatchNorm {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T>::ConstVec mean,
+ typename TTypes<T>::ConstVec var,
+ typename TTypes<T>::ConstVec beta,
+ typename TTypes<T>::ConstVec gamma, float variance_epsilon,
+ bool scale_after_normalization,
+ typename TTypes<T, 4>::Tensor output) {
+ const int depth = mean.dimension(0);
+ const int rest_size = input.size() / depth;
+
+ Eigen::DSizes<int, 2> rest_by_depth(rest_size, depth);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<int, 2> one_by_depth(1, depth);
+ Eigen::DSizes<int, 2> depth_by_one(depth, 1);
+#else
+ Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one;
+ rest_by_one.set(0, rest_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_depth;
+ one_by_depth.set(1, depth);
+ Eigen::IndexList<int, Eigen::type2index<1> > depth_by_one;
+ depth_by_one.set(0, depth);
+#endif
+ if (scale_after_normalization) {
+ output.reshape(rest_by_depth).device(d) =
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)) *
+ ((var + var.constant(variance_epsilon)).rsqrt() * gamma)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one) +
+ beta.reshape(one_by_depth).broadcast(rest_by_one);
+ } else {
+ output.reshape(rest_by_depth).device(d) =
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)) *
+ ((var + var.constant(variance_epsilon)).rsqrt())
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one) +
+ beta.reshape(one_by_depth).broadcast(rest_by_one);
+ }
+ }
+};
+
+template <typename Device, typename T>
+struct BatchNormGrad {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T>::ConstVec mean,
+ typename TTypes<T>::ConstVec var,
+ typename TTypes<T>::ConstVec gamma,
+ typename TTypes<T, 4>::ConstTensor out_backprop,
+ float variance_epsilon, bool scale_after_normalization,
+ typename TTypes<T, 4>::Tensor dx, typename TTypes<T>::Vec dm,
+ typename TTypes<T>::Vec dv, typename TTypes<T>::Vec db,
+ typename TTypes<T>::Vec dg, typename TTypes<T>::Vec scratch1,
+ typename TTypes<T>::Vec scratch2) {
+ const int depth = mean.dimension(0);
+ const int rest_size = input.size() / depth;
+
+ typedef typename TTypes<T>::ConstVec::Index Index;
+ Eigen::DSizes<Index, 2> rest_by_depth(rest_size, depth);
+ Eigen::DSizes<Index, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<Index, 2> one_by_depth(1, depth);
+
+    // db = sum_over_rest(out_backprop)
+    //
+    // dg = sum_over_rest(out_backprop * (x - m)) * rsqrt(v + epsilon)
+    //
+    // dv = sum_over_rest(out_backprop * gamma * (x - m)) *
+    //      (-1/2) * (v + epsilon) ^ (-3/2)
+    //
+    // dm = sum_over_rest(out_backprop * gamma) * (-1) * rsqrt(v + epsilon)
+    //
+    // dx = out_backprop * (gamma * rsqrt(v + epsilon))
+ Eigen::array<Index, 1> reduction_axis;
+ reduction_axis[0] = 0; // Reduces on first dimension.
+
+ db.device(d) = out_backprop.reshape(rest_by_depth).sum(reduction_axis);
+
+ // scratch1 = rsqrt(v + epsilon)
+ scratch1.device(d) = (var + var.constant(variance_epsilon)).rsqrt();
+
+ // scratch2 = sum_over_rest(out_backprop * (x - m))
+ scratch2.device(d) = (out_backprop.reshape(rest_by_depth) *
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)))
+ .sum(reduction_axis);
+
+ if (scale_after_normalization) {
+ dx.reshape(rest_by_depth).device(d) =
+ out_backprop.reshape(rest_by_depth) * ((scratch1 * gamma)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one));
+ dm.device(d) = -db * (scratch1 * gamma).eval();
+ dg.device(d) = scratch2 * scratch1;
+ } else {
+ dx.reshape(rest_by_depth).device(d) =
+ out_backprop.reshape(rest_by_depth) *
+ scratch1.reshape(one_by_depth).broadcast(rest_by_one);
+ dm.device(d) = -db * scratch1;
+ dg.device(d) = dg.constant(static_cast<T>(0.0)); // Gamma is not learned.
+ }
+
+ // scratch1 = - 1/2 * (var + epsilon) ^ (-3/2)
+ scratch1.device(d) = scratch1 * scratch1.constant(static_cast<T>(-0.5f)) /
+ (var + var.constant(variance_epsilon));
+
+ if (scale_after_normalization) {
+ dv.device(d) = scratch2 * (scratch1 * gamma).eval();
+ } else {
+ dv.device(d) = scratch2 * scratch1;
+ }
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
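The gradient comments in BatchNormGrad, restated in conventional notation (g denotes out_backprop, sums run over every dimension except depth, and m and v are treated as constants, matching the scale_after_normalization path of the functor above):

\[
\begin{aligned}
\frac{\partial L}{\partial \beta}  &= \sum_{\mathrm{rest}} g \\
\frac{\partial L}{\partial \gamma} &= \frac{1}{\sqrt{v+\epsilon}} \sum_{\mathrm{rest}} g\,(x-m) \\
\frac{\partial L}{\partial v}      &= -\tfrac{1}{2}\,\gamma\,(v+\epsilon)^{-3/2} \sum_{\mathrm{rest}} g\,(x-m) \\
\frac{\partial L}{\partial m}      &= -\frac{\gamma}{\sqrt{v+\epsilon}} \sum_{\mathrm{rest}} g \\
\frac{\partial L}{\partial x}      &= \frac{\gamma}{\sqrt{v+\epsilon}}\, g
\end{aligned}
\]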
diff --git a/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc
new file mode 100644
index 0000000000..02e0eeecfa
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc
@@ -0,0 +1,17 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/batch_norm_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::BatchNorm<GPUDevice, float>;
+template struct functor::BatchNormGrad<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
new file mode 100644
index 0000000000..bb1492e5b4
--- /dev/null
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -0,0 +1,71 @@
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+
+// Given shapes of two tensors, computes the reduction indices for the
+// gradient computation.
+//
+// TODO(zhifengc):
+// 1. Add support for n-ary operations (n > 2).
+class BCastGradArgsOp : public OpKernel {
+ public:
+ explicit BCastGradArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(
+ ctx, ctx->MatchSignature({DT_INT32, DT_INT32}, {DT_INT32, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ OP_REQUIRES(
+ ctx, ctx->num_inputs() == 2,
+ errors::Unimplemented("Broadcast for n-ary operations (n > 2)"));
+ gtl::InlinedVector<BCast::Vec, 4> shapes;
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ const Tensor& in = ctx->input(i);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in.shape()),
+ errors::InvalidArgument("In[", i, "] must be a vector.",
+ in.shape().ShortDebugString()));
+ BCast::Vec vec;
+ for (int64 i = 0; i < in.NumElements(); ++i) {
+ vec.push_back(in.vec<int32>()(i));
+ }
+ shapes.push_back(vec);
+ }
+ BCast bcast(shapes[0], shapes[1]);
+ OP_REQUIRES(ctx, bcast.IsValid(),
+ errors::InvalidArgument(
+ "Incompatible shapes: [", str_util::Join(shapes[0], ","),
+ "] vs. [", str_util::Join(shapes[1], ","), "]"));
+ Output(ctx, 0, bcast.grad_x_reduce_idx());
+ Output(ctx, 1, bcast.grad_y_reduce_idx());
+ }
+
+ private:
+ void Output(OpKernelContext* ctx, int idx, const BCast::Vec& v) {
+ const int len = v.size();
+ Tensor* o = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(idx, TensorShape({len}), &o));
+ for (int i = 0; i < len; ++i) o->flat<int32>()(i) = v[i];
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+ .Device(DEVICE_CPU)
+ .HostMemory("s0")
+ .HostMemory("s1")
+ .HostMemory("r0")
+ .HostMemory("r1"),
+ BCastGradArgsOp);
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+ .Device(DEVICE_GPU)
+ .HostMemory("s0")
+ .HostMemory("s1")
+ .HostMemory("r0")
+ .HostMemory("r1"),
+ BCastGradArgsOp);
+
+} // end namespace tensorflow
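A self-contained sketch of the reduction-index rule that BroadcastGradientArgs computes (a simplified derivation, not the BCast class itself): align the shapes from the right; every output dimension where an input was 1, or absent, must be summed over when back-propagating to that input.

#include <cstdio>
#include <vector>

std::vector<int> GradReduceIdx(const std::vector<int>& x,
                               const std::vector<int>& out) {
  std::vector<int> idx;
  const int offset = static_cast<int>(out.size()) - static_cast<int>(x.size());
  for (int d = 0; d < static_cast<int>(out.size()); ++d) {
    const int xd = (d < offset) ? 1 : x[d - offset];  // missing dims act as 1
    if (xd == 1 && out[d] > 1) idx.push_back(d);
  }
  return idx;
}

int main() {
  const std::vector<int> x = {2, 3, 5}, y = {3, 1};
  const std::vector<int> out = {2, 3, 5};  // broadcast shape of x and y
  std::printf("r0 (reduce for x):");
  for (int d : GradReduceIdx(x, out)) std::printf(" %d", d);  // (none)
  std::printf("\nr1 (reduce for y):");
  for (int d : GradReduceIdx(y, out)) std::printf(" %d", d);  // 0 2
  std::printf("\n");
  return 0;
}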
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
new file mode 100644
index 0000000000..68737f6c2d
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -0,0 +1,112 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bias_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BiasOp : public BinaryOp<T> {
+ public:
+ explicit BiasOp(OpKernelConstruction* context) : BinaryOp<T>(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& bias = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input.shape()),
+ errors::InvalidArgument("Input tensor must be at least 2D: ",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()),
+ errors::InvalidArgument("Biases must be 1D: ",
+ bias.shape().DebugString()));
+ const auto last_dim = input.shape().dims() - 1;
+ OP_REQUIRES(
+ context, bias.shape().dim_size(0) == input.shape().dim_size(last_dim),
+ errors::InvalidArgument(
+ "Must provide as many biases as the last dimension "
+ "of the input tensor: ",
+ bias.shape().DebugString(), " vs. ", input.shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ switch (input.shape().dims()) {
+ case 2:
+ Compute<2>(context, input, bias, output);
+ break;
+ case 3:
+ Compute<3>(context, input, bias, output);
+ break;
+ case 4:
+ Compute<4>(context, input, bias, output);
+ break;
+ case 5:
+ Compute<5>(context, input, bias, output);
+ break;
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("Only ranks up to 5 supported: ",
+ input.shape().DebugString()));
+ }
+ }
+
+  // Add biases to an input tensor of rank Dims using the Bias functor.
+ template <int Dims>
+ void Compute(OpKernelContext* ctx, const Tensor& input, const Tensor& bias,
+ Tensor* output) {
+ functor::Bias<Device, T, Dims> functor;
+ functor(ctx->eigen_device<Device>(), input.tensor<T, Dims>(), bias.vec<T>(),
+ output->tensor<T, Dims>());
+ }
+};
+
+#define REGISTER_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BiasAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ BiasOp<CPUDevice, type>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void Bias<GPUDevice, T, Dims>::operator()( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ typename TTypes<T>::ConstVec bias, \
+ typename TTypes<T, Dims>::Tensor output); \
+ extern template struct Bias<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BiasAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ BiasOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/bias_op.h b/tensorflow/core/kernels/bias_op.h
new file mode 100644
index 0000000000..513406d251
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op.h
@@ -0,0 +1,41 @@
+#ifndef TENSORFLOW_KERNELS_BIAS_OP_H_
+#define TENSORFLOW_KERNELS_BIAS_OP_H_
+// Functor definition for BiasOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by BiasOp to do the computations.
+template <typename Device, typename T, int Dims>
+struct Bias {
+ // Add "bias" to "input", broadcasting it on all dimensions but the last one.
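+  // For example (illustrative shapes only): for an NHWC input of shape
+  // [N, H, W, C] and a bias vector of length C, the input is viewed as an
+  // [N*H*W, C] matrix and the bias, reshaped to [1, C], is broadcast across
+  // every row before the addition.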
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ typename TTypes<T>::ConstVec bias,
+ typename TTypes<T, Dims>::Tensor output) {
+ const int bias_size = bias.dimension(0);
+ const int rest_size = input.size() / bias_size;
+
+ Eigen::DSizes<int, 2> rest_by_bias(rest_size, bias_size);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<int, 2> one_by_bias(1, bias_size);
+#else
+ Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one;
+ rest_by_one.set(0, rest_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_bias;
+ one_by_bias.set(1, bias_size);
+#endif
+
+ output.reshape(rest_by_bias).device(d) =
+ input.reshape(rest_by_bias) +
+ bias.reshape(one_by_bias).broadcast(rest_by_one);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_BIAS_OP_H_
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
new file mode 100644
index 0000000000..d3377b3ce8
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -0,0 +1,23 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bias_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in bias_op.cc.
+#define DEFINE_GPU_SPECS(T) \
+ template struct functor::Bias<GPUDevice, T, 2>; \
+ template struct functor::Bias<GPUDevice, T, 3>; \
+ template struct functor::Bias<GPUDevice, T, 4>; \
+ template struct functor::Bias<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
new file mode 100644
index 0000000000..cd5fde37a6
--- /dev/null
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -0,0 +1,243 @@
+// See docs in ../ops/candidate_sampling_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <cfloat>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/range_sampler.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+class BaseCandidateSamplerOp : public OpKernel {
+ public:
+ explicit BaseCandidateSamplerOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &num_sampled_));
+ OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_));
+ OP_REQUIRES_OK(context, context->GetAttr("unique", &unique_));
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& true_classes = context->input(0);
+ OP_REQUIRES(context, true_classes.dims() == 2,
+ errors::InvalidArgument("true_classes must be a matrix"));
+ const int32 batch_size = true_classes.dim_size(0);
+ OP_REQUIRES(context, true_classes.dim_size(1) == num_true_,
+ errors::InvalidArgument("true_classes must have "
+ "num_true columns"));
+
+ // Output candidates and expected_count.
+ Tensor* out_sampled_candidates = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({num_sampled_}),
+ &out_sampled_candidates));
+
+ Tensor* out_true_expected_count = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 1, TensorShape({batch_size, num_true_}),
+ &out_true_expected_count));
+ Tensor* out_sampled_expected_count = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(2, TensorShape({num_sampled_}),
+ &out_sampled_expected_count));
+
+ gtl::ArraySlice<int64> true_candidate(true_classes.matrix<int64>().data(),
+ batch_size * num_true_);
+ gtl::MutableArraySlice<int64> sampled_candidate(
+ out_sampled_candidates->vec<int64>().data(), num_sampled_);
+ gtl::MutableArraySlice<float> true_expected_count(
+ out_true_expected_count->matrix<float>().data(),
+ batch_size * num_true_);
+ gtl::MutableArraySlice<float> sampled_expected_count(
+ out_sampled_expected_count->vec<float>().data(), num_sampled_);
+
+ CHECK(sampler_) << "CandidateSamplerOp did not set sampler_";
+
+    // Conservatively estimate the number of random samples required.
+ // In cases where rejection sampling is used we may occasionally use more
+ // samples than expected, which will result in reused random bits.
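+    // For instance, with num_sampled_ = 64 the reservation below comes to
+    // 2048 * 64 = 131072 32-bit samples from the Philox generator.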
+ const int64 samples32 = 2048 * num_sampled_;
+
+ // Pick sampled candidates.
+ auto local_gen = generator_.ReserveSamples32(samples32);
+ random::SimplePhilox random(&local_gen);
+ sampler_->SampleBatchGetExpectedCount(&random, unique_, &sampled_candidate,
+ &sampled_expected_count,
+ true_candidate, &true_expected_count);
+
+ if (sampler_->NeedsUpdates()) {
+ sampler_->Update(true_candidate);
+ }
+ }
+
+ protected:
+ void set_sampler(RangeSampler* sampler) { sampler_.reset(sampler); }
+
+ private:
+ int32 num_true_;
+ int32 num_sampled_;
+ bool unique_;
+ std::unique_ptr<RangeSampler> sampler_;
+ GuardedPhiloxRandom generator_;
+};
+
+template <class RangeSamplerType>
+class SimpleCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit SimpleCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max));
+ set_sampler(new RangeSamplerType(range_max));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<UniformSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<LogUniformSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
+ .Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<UnigramSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
+ .Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+
+class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit AllCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &range_max));
+ set_sampler(new AllSampler(range_max));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("AllCandidateSampler").Device(DEVICE_CPU),
+ AllCandidateSamplerOp);
+
+class FixedUnigramCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit FixedUnigramCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max));
+ string vocab_file;
+ OP_REQUIRES_OK(context, context->GetAttr("vocab_file", &vocab_file));
+ std::vector<float> unigrams;
+ OP_REQUIRES_OK(context, context->GetAttr("unigrams", &unigrams));
+ OP_REQUIRES(
+ context, !vocab_file.empty() || !unigrams.empty(),
+ errors::InvalidArgument("Must provide either vocab_file or unigrams."));
+ OP_REQUIRES(context, vocab_file.empty() || unigrams.empty(),
+ errors::InvalidArgument(
+ "Must only provide one of vocab_file and unigrams."));
+ float distortion;
+ OP_REQUIRES_OK(context, context->GetAttr("distortion", &distortion));
+ int64 num_reserved_ids;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("num_reserved_ids", &num_reserved_ids));
+ int64 num_shards;
+ OP_REQUIRES_OK(context, context->GetAttr("num_shards", &num_shards));
+ int64 shard;
+ OP_REQUIRES_OK(context, context->GetAttr("shard", &shard));
+
+ if (!vocab_file.empty()) {
+ set_sampler(new FixedUnigramSampler(context->env(), range_max, vocab_file,
+ distortion, num_reserved_ids,
+ num_shards, shard));
+ } else {
+ set_sampler(new FixedUnigramSampler(range_max, unigrams, distortion,
+ num_reserved_ids, num_shards, shard));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FixedUnigramCandidateSampler").Device(DEVICE_CPU),
+ FixedUnigramCandidateSamplerOp);
+
+class ComputeAccidentalHitsOp : public OpKernel {
+ public:
+ explicit ComputeAccidentalHitsOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in_true_candidates = context->input(0);
+ TensorShape in_true_candidates_shape = in_true_candidates.shape();
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+ in_true_candidates_shape.dim_size(1) == num_true_,
+ errors::InvalidArgument(
+ "true_candidates must be a batch_size * num_true matrix"));
+
+ const int64 batch_size = in_true_candidates_shape.dim_size(0);
+
+ const Tensor& in_sampled_candidates = context->input(1);
+ OP_REQUIRES(context,
+ TensorShapeUtils::IsVector(in_sampled_candidates.shape()),
+ errors::InvalidArgument(
+ "sampled_candidates must be a vector, which is typically "
+ "an output from CandidateSampler"));
+
+ std::unordered_map<int64, int> sampled_candidate_to_pos;
+ for (int64 i = 0; i < in_sampled_candidates.dim_size(0); ++i) {
+ sampled_candidate_to_pos[in_sampled_candidates.vec<int64>()(i)] = i;
+ }
+
+ // Produce output in the same format as UnpackSparseFeatures.
+ std::vector<int> indices;
+ std::vector<int64> ids;
+ std::vector<float> weights;
+
+ for (int64 i = 0; i < batch_size; ++i) {
+ for (int64 j = 0; j < num_true_; ++j) {
+ const int64 true_candidate = in_true_candidates.matrix<int64>()(i, j);
+ const auto look = sampled_candidate_to_pos.find(true_candidate);
+ if (look != sampled_candidate_to_pos.end()) {
+ indices.push_back(i);
+ ids.push_back(look->second);
+ weights.push_back(-FLT_MAX);
+ }
+ }
+ }
+
+ Tensor* out_indices = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 0, TensorShape({static_cast<int>(indices.size())}), &out_indices));
+ Tensor* out_ids = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(
+ 1, TensorShape({static_cast<int>(ids.size())}), &out_ids));
+ Tensor* out_weights = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 2, TensorShape({static_cast<int>(weights.size())}), &out_weights));
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ out_indices->vec<int32>()(i) = indices[i];
+ out_ids->vec<int64>()(i) = ids[i];
+ out_weights->vec<float>()(i) = weights[i];
+ }
+ }
+
+ private:
+ int64 num_true_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ComputeAccidentalHits").Device(DEVICE_CPU),
+ ComputeAccidentalHitsOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
new file mode 100644
index 0000000000..779ac57b6a
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -0,0 +1,233 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/cast_op.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename Device, typename Tout, typename Tin>
+void CastMaybeInline(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i) {
+ if (o.size() * (sizeof(Tin) + sizeof(Tout)) < 131072) {
+    // Small cast on a CPU: do it inline to avoid the overhead of dispatching
+    // to the device's thread pool.
+ o = i.template cast<Tout>();
+ } else {
+ o.device(d) = i.template cast<Tout>();
+ }
+}
+
+template <typename O, typename I>
+struct CastFunctor<CPUDevice, O, I> {
+ void operator()(const CPUDevice& d, typename TTypes<O>::Flat o,
+ typename TTypes<I>::ConstFlat i) {
+ CastMaybeInline<CPUDevice, O, I>(d, o, i);
+ }
+};
+
+} // namespace functor
+
+#define CAST_CASE(DEVICE, IN, OUT) \
+ if (DataTypeToEnum<IN>::value == src_dtype_ && \
+ DataTypeToEnum<OUT>::value == dst_dtype_) { \
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { \
+ functor::CastFunctor<DEVICE, OUT, IN> func; \
+ func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>()); \
+ }; \
+ return Status::OK(); \
+ }
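+
+// For instance (an illustrative expansion, not part of this change),
+// CAST_CASE(CPUDevice, float, int64), used below, behaves roughly like:
+//
+//   if (DataTypeToEnum<float>::value == src_dtype_ &&
+//       DataTypeToEnum<int64>::value == dst_dtype_) {
+//     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+//       functor::CastFunctor<CPUDevice, int64, float> func;
+//       func(ctx->eigen_device<CPUDevice>(), out->flat<int64>(),
+//            inp.flat<float>());
+//     };
+//     return Status::OK();
+//   }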
+
+class CastOpBase : public OpKernel {
+ public:
+ explicit CastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &src_dtype_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ if (work_ == nullptr) {
+ ctx->set_output(0, inp);
+ } else {
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+ work_(ctx, inp, out);
+ }
+ }
+
+ protected:
+ DataType src_dtype_;
+ DataType dst_dtype_;
+ std::function<void(OpKernelContext*, const Tensor&, Tensor*)> work_ = nullptr;
+
+ virtual Status Prepare() = 0;
+ Status Unimplemented() {
+ return errors::Unimplemented("Cast ", DataTypeString(src_dtype_), " to ",
+ DataTypeString(dst_dtype_),
+ " is not supported");
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
+};
+
+class CpuCastOp : public CastOpBase {
+ public:
+ explicit CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+ OP_REQUIRES_OK(ctx, Prepare());
+ }
+
+ protected:
+ Status Prepare() override {
+ if (src_dtype_ == dst_dtype_) {
+ work_ = nullptr; // Identity
+ return Status::OK();
+ }
+ CAST_CASE(CPUDevice, bool, float);
+ CAST_CASE(CPUDevice, bool, int32);
+ CAST_CASE(CPUDevice, bool, double);
+ CAST_CASE(CPUDevice, double, float);
+ CAST_CASE(CPUDevice, double, int32);
+ CAST_CASE(CPUDevice, double, int64);
+ CAST_CASE(CPUDevice, float, double);
+ CAST_CASE(CPUDevice, float, uint8);
+ CAST_CASE(CPUDevice, float, int32);
+ CAST_CASE(CPUDevice, float, int64);
+ CAST_CASE(CPUDevice, int32, double);
+ CAST_CASE(CPUDevice, int32, float);
+ CAST_CASE(CPUDevice, int32, uint8);
+ CAST_CASE(CPUDevice, int32, int64);
+ CAST_CASE(CPUDevice, int64, double);
+ CAST_CASE(CPUDevice, int64, float);
+ CAST_CASE(CPUDevice, int64, int32);
+ CAST_CASE(CPUDevice, uint8, float);
+ CAST_CASE(CPUDevice, uint8, int32);
+ CAST_CASE(CPUDevice, uint8, int64);
+ CAST_CASE(CPUDevice, uint8, double);
+ if (src_dtype_ == DT_BFLOAT16 && dst_dtype_ == DT_FLOAT) {
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+ int64 N = out->NumElements();
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+ int num_threads =
+ std::min<int>(std::min(4, worker_threads->num_threads), N / 4096);
+ if (num_threads < 1) {
+ BFloat16ToFloat(inp.flat<bfloat16>().data(),
+ out->flat<float>().data(), N);
+ } else {
+ auto work = [&inp, &out](int64 start, int64 end) {
+ BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
+ out->flat<float>().data() + start, end - start);
+ };
+ Shard(num_threads, worker_threads->workers, N, 100, work);
+ }
+ };
+ return Status::OK();
+ }
+ if (src_dtype_ == DT_FLOAT && dst_dtype_ == DT_BFLOAT16) {
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+ int64 N = out->NumElements();
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+ int num_threads =
+ std::min<int>(std::min(4, worker_threads->num_threads), N / 4096);
+ if (num_threads < 1) {
+ FloatToBFloat16(inp.flat<float>().data(),
+ out->flat<bfloat16>().data(), N);
+ } else {
+ auto work = [&inp, &out](int64 start, int64 end) {
+ FloatToBFloat16(inp.flat<float>().data() + start,
+ out->flat<bfloat16>().data() + start, end - start);
+ };
+ Shard(num_threads, worker_threads->workers, N, 100, work);
+ }
+ };
+ return Status::OK();
+ }
+ return Unimplemented();
+ }
+};
+
+class GpuCastOp : public CastOpBase {
+ public:
+ explicit GpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+ OP_REQUIRES_OK(ctx, Prepare());
+ }
+
+ protected:
+ Status Prepare() override {
+ if (src_dtype_ == dst_dtype_) {
+ work_ = nullptr; // Identity
+ return Status::OK();
+ }
+ CAST_CASE(GPUDevice, bfloat16, float);
+ CAST_CASE(GPUDevice, bool, float);
+ CAST_CASE(GPUDevice, double, float);
+ CAST_CASE(GPUDevice, double, int64);
+ CAST_CASE(GPUDevice, float, bfloat16);
+ CAST_CASE(GPUDevice, float, double);
+ CAST_CASE(GPUDevice, float, int64);
+ CAST_CASE(GPUDevice, int64, double);
+ CAST_CASE(GPUDevice, int64, float);
+ CAST_CASE(GPUDevice, uint8, float);
+ CAST_CASE(GPUDevice, float, uint8);
+ CAST_CASE(GPUDevice, bool, int32);
+ CAST_CASE(GPUDevice, double, int32);
+ CAST_CASE(GPUDevice, float, int32);
+ CAST_CASE(GPUDevice, int32, double);
+ CAST_CASE(GPUDevice, int32, float);
+ CAST_CASE(GPUDevice, int32, int64);
+ CAST_CASE(GPUDevice, int64, int32);
+ return Unimplemented();
+ }
+};
+
+#undef CAST_CASE
+
+REGISTER_KERNEL_BUILDER(Name("Cast").Device(DEVICE_CPU), CpuCastOp);
+
+#if GOOGLE_CUDA
+#define REGISTER_CAST_GPU(srctype, dsttype) \
+ REGISTER_KERNEL_BUILDER(Name("Cast") \
+ .TypeConstraint<srctype>("SrcT") \
+ .TypeConstraint<dsttype>("DstT") \
+ .Device(DEVICE_GPU), \
+ GpuCastOp);
+REGISTER_CAST_GPU(bfloat16, float);
+REGISTER_CAST_GPU(bool, float);
+REGISTER_CAST_GPU(double, float);
+REGISTER_CAST_GPU(double, int64);
+REGISTER_CAST_GPU(float, bfloat16);
+REGISTER_CAST_GPU(float, double);
+REGISTER_CAST_GPU(float, int64);
+REGISTER_CAST_GPU(int64, double);
+REGISTER_CAST_GPU(int64, float);
+REGISTER_CAST_GPU(uint8, float);
+REGISTER_CAST_GPU(float, uint8);
+REGISTER_CAST_GPU(bool, int32);
+REGISTER_CAST_GPU(double, int32);
+REGISTER_CAST_GPU(float, int32);
+REGISTER_CAST_GPU(int32, double);
+REGISTER_CAST_GPU(int32, float);
+REGISTER_CAST_GPU(int32, int64);
+REGISTER_CAST_GPU(int64, int32);
+#undef REGISTER_CAST_GPU
+#endif // GOOGLE_CUDA
+
+// HostCast differs from Cast in that its input and output are in host memory.
+REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"),
+ CpuCastOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
new file mode 100644
index 0000000000..d066206abc
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op.h
@@ -0,0 +1,71 @@
+#ifndef TENSORFLOW_KERNELS_CAST_OP_H_
+#define TENSORFLOW_KERNELS_CAST_OP_H_
+
+#include "tensorflow/core/framework/bfloat16.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/port.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename Tout, typename Tin>
+void Cast(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i) {
+ o.device(d) = i.template cast<Tout>();
+}
+
+template <typename Device, typename Tout, typename Tin>
+struct CastFunctor {
+ void operator()(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+namespace Eigen {
+namespace internal {
+
+// Specialized cast op impls for bfloat16.
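+// A bfloat16 is just the high-order 16 bits of a float (sign bit, 8 exponent
+// bits, 7 mantissa bits). For example (illustrative values): 3.14f has the
+// bit pattern 0x4048F5C3; truncating to the upper half gives the bfloat16
+// pattern 0x4048, which converts back to 3.125f.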
+template <>
+struct scalar_cast_op< ::tensorflow::bfloat16, float> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef float result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(
+ const ::tensorflow::bfloat16& a) const {
+ static_assert(::tensorflow::port::kLittleEndian, "");
+ float ret;
+ uint16_t* p = reinterpret_cast<uint16_t*>(&ret);
+ p[0] = 0;
+ p[1] = a.value;
+ return ret;
+ }
+};
+
+template <>
+struct functor_traits<scalar_cast_op< ::tensorflow::bfloat16, float> > {
+ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
+};
+
+template <>
+struct scalar_cast_op<float, ::tensorflow::bfloat16> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef ::tensorflow::bfloat16 result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ::tensorflow::bfloat16 operator()(
+ const float a) const {
+ static_assert(::tensorflow::port::kLittleEndian, "");
+ const uint16_t* p = reinterpret_cast<const uint16_t*>(&a);
+ return ::tensorflow::bfloat16(p[1]);
+ }
+};
+
+template <>
+struct functor_traits<scalar_cast_op<float, ::tensorflow::bfloat16> > {
+ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
+};
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // TENSORFLOW_KERNELS_CAST_OP_H_
diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc
new file mode 100644
index 0000000000..cd198c752b
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc
@@ -0,0 +1,45 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/bfloat16.h"
+#include "tensorflow/core/kernels/cast_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename O, typename I>
+struct CastFunctor<GPUDevice, O, I> {
+ void operator()(const GPUDevice& d, typename TTypes<O>::Flat o,
+ typename TTypes<I>::ConstFlat i) {
+ Cast<GPUDevice, O, I>(d, o, i);
+ }
+};
+
+#define DEFINE(O, I) template struct CastFunctor<GPUDevice, O, I>;
+DEFINE(float, double);
+DEFINE(float, int32);
+DEFINE(float, int64);
+DEFINE(double, float);
+DEFINE(double, int32);
+DEFINE(double, int64);
+DEFINE(int32, float);
+DEFINE(int32, double);
+DEFINE(int32, int64);
+DEFINE(int64, float);
+DEFINE(int64, double);
+DEFINE(int64, int32);
+DEFINE(int32, bool);
+DEFINE(float, bool);
+DEFINE(float, uint8);
+DEFINE(uint8, float);
+DEFINE(float, bfloat16);
+DEFINE(bfloat16, float);
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
new file mode 100644
index 0000000000..f774fbcfe8
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -0,0 +1,100 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+template <typename Src, typename Dst>
+static Graph* Cast(int num) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DataTypeToEnum<Src>::value,
+ TensorShape({64, 64, num / (64 * 64)}));
+ data.flat<Src>().setRandom();
+ test::graph::Cast(g, test::graph::Constant(g, data),
+ DataTypeToEnum<Dst>::value);
+ return g;
+}
+
+class CastOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType src, DataType dst) {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+ .Input(FakeInput(DT_INT32))
+ .Attr("SrcT", src)
+ .Attr("DstT", dst)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(CastOpTest, Int32ToUint8) {
+ MakeOp(DT_INT32, DT_UINT8);
+ AddInputFromArray<int32>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_UINT8, TensorShape({1, 2, 2, 1}));
+ test::FillValues<uint8>(&expected, {1, 2, 3, 4});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+static void BM_cpu_float_int64(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(int64)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_gpu_float_int64(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(int64)));
+ testing::UseRealTime();
+ test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
+}
+BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_bool_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(bool) + sizeof(float)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_gpu_bool_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(bool) + sizeof(float)));
+ testing::UseRealTime();
+ test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
+}
+BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_float_bfloat16(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(bfloat16)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_bfloat16_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(bfloat16)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
new file mode 100644
index 0000000000..65487a303c
--- /dev/null
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -0,0 +1,190 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <math.h>
+#include <algorithm>
+#include <numeric>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#if GOOGLE_CUDA
+template <typename T>
+struct CheckNumericsLaunch {
+ void Run(const GPUDevice& d, const T* data, int size,
+ int abnormal_detected[2]);
+};
+#endif
+
+namespace {
+
+template <typename Device, typename T>
+class CheckNumericsOp;
+
+// Partial specialization for CPU
+template <typename T>
+class CheckNumericsOp<CPUDevice, T> : public OpKernel {
+ public:
+ explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
+ // message_ is used as the prefix for the assertion error message. For
+ // instance, this can be the name of the input op that produced the tensor.
+ OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // pass along the input to the output
+ context->set_output(0, context->input(0));
+
+ auto in = context->input(0).flat<T>();
+ const T* data = in.data();
+ const int size = in.size();
+ // Check to see if any element of the tensor is NaN or Inf.
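+    // A single pass folds each element's classification into a bitmask:
+    // kInfBit is OR'd in when an Inf is seen, kNaNBit when a NaN is seen.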
+ int fp_props =
+ std::accumulate(data, data + size, 0, [](const int& x, const T& y) {
+ int prop = std::fpclassify(y);
+ int result = x;
+ if (prop == FP_INFINITE) {
+ result |= kInfBit;
+ } else if (prop == FP_NAN) {
+ result |= kNaNBit;
+ }
+ return result;
+ });
+ string status;
+ if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
+ status = "Inf and NaN";
+ } else {
+ if (fp_props & kInfBit) {
+ status = "Inf";
+ }
+ if (fp_props & kNaNBit) {
+ status = "NaN";
+ }
+ }
+ if (!status.empty()) {
+ context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
+ status, " values"));
+ }
+ }
+
+ private:
+ string message_;
+ static const int kInfBit = 0x01;
+ static const int kNaNBit = 0x02;
+};
+
+#if GOOGLE_CUDA
+// Partial specialization for GPU
+template <typename T>
+class CheckNumericsOp<GPUDevice, T> : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
+ // message_ is used as the prefix for the assertion error message. For
+ // instance, this can be the name of the input op that produced the tensor.
+ OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // pass along the input to the output
+ context->set_output(0, context->input(0));
+ auto input = context->input(0).flat<T>();
+
+ // Allocate and initialize the elements to hold the check results
+ const int abnormal_detected_size = 2;
+ Tensor abnormal_detected;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DT_INT32, TensorShape({abnormal_detected_size}),
+ &abnormal_detected));
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
+ abnormal_detected.flat<int>().data(),
+ abnormal_detected.flat<int>().size());
+ stream->ThenMemset32(&abnormal_detected_ptr, 0,
+ abnormal_detected.flat<int>().size() * sizeof(int));
+
+ // Call the Cuda kernels for the numerical checks
+ const Device& d = context->eigen_device<Device>();
+ CheckNumericsLaunch<T>().Run(d, input.data(), input.size(),
+ abnormal_detected.flat<int>().data());
+
+ // Copy the results from device to host
+ AllocatorAttributes attr;
+ attr.set_on_host(true);
+ attr.set_gpu_compatible(true);
+ Tensor abnormal_detected_out;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DT_INT32, TensorShape({abnormal_detected_size}),
+ &abnormal_detected_out, attr));
+ int* abnormal_detected_host = abnormal_detected_out.flat<int>().data();
+ stream->ThenMemcpy(abnormal_detected_host, abnormal_detected_ptr,
+ abnormal_detected_size * sizeof(int));
+ stream->BlockHostUntilDone();
+ OP_REQUIRES(context, stream->ok(),
+ errors::Internal("cudaMemcpy from device to host failed"));
+
+ int is_nan = abnormal_detected_host[0];
+ int is_inf = abnormal_detected_host[1];
+ if (is_nan || is_inf) {
+ string status;
+ LOG(ERROR) << "abnormal_detected_host @" << abnormal_detected_host
+ << " = {" << is_nan << ", " << is_inf << "} " << message_;
+
+ // Results should always be 1 or 0. If we see anything else then
+ // there has been some GPU memory corruption.
+ CHECK_GE(is_nan, 0);
+ CHECK_GE(is_inf, 0);
+ CHECK_LE(is_nan, 1);
+ CHECK_LE(is_inf, 1);
+
+ if (is_nan && is_inf) {
+ status = "Inf and NaN";
+ } else if (is_nan) {
+ status = "NaN";
+ } else if (is_inf) {
+ status = "Inf";
+ }
+ context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
+ status, " values"));
+ }
+ }
+
+ private:
+ string message_;
+};
+#endif // GOOGLE_CUDA
+
+} // namespace
+
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ CheckNumericsOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ CheckNumericsOp<CPUDevice, double>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ CheckNumericsOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T"),
+ CheckNumericsOp<GPUDevice, double>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
new file mode 100644
index 0000000000..cb84f98731
--- /dev/null
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -0,0 +1,62 @@
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+#include <assert.h>
+
+#include <math.h>
+#include <algorithm>
+
+#include "tensorflow/core/platform/port.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// A CUDA kernel that checks whether each element is Inf or NaN. If any such
+// element exists, the corresponding entry in abnormal_detected is set.
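+// Each thread starts at its global index and advances by the total number of
+// launched threads (a grid-stride loop), so every element is visited exactly
+// once regardless of the grid configuration.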
+template <typename T>
+__global__ void CheckNumericsKernel(const T *data, int size,
+ int abnormal_detected[2]) {
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+
+ int32 offset = thread_id;
+
+ while (offset < size) {
+ if (isnan(data[offset])) {
+ abnormal_detected[0] = 1;
+ }
+ if (isinf(data[offset])) {
+ abnormal_detected[1] = 1;
+ }
+ offset += total_thread_count;
+ }
+}
+
+} // namespace
+
+// A simple launch pad for the CUDA kernel that checks the given array for
+// numerical abnormalities.
+template <typename T>
+struct CheckNumericsLaunch {
+ void Run(const GPUDevice &d, const T *data, int size,
+ int abnormal_detected[2]) {
+ const int32 block_size = d.maxCudaThreadsPerBlock();
+ const int32 num_blocks =
+ (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+ block_size;
+
+ CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>(
+ data, size, abnormal_detected);
+ }
+};
+
+template struct CheckNumericsLaunch<float>;
+template struct CheckNumericsLaunch<double>;
+
+} // namespace tensorflow
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc
new file mode 100644
index 0000000000..12632fb248
--- /dev/null
+++ b/tensorflow/core/kernels/cholesky_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/linalg_ops.cc.
+// TODO(konstantinos): Enable complex inputs. This will require additional tests
+// and OP_REQUIRES.
+
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/Cholesky"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class CholeskyOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit CholeskyOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return input_matrix_shape;
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+      // Use a big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ if (input.rows() == 0) {
+      // If X is an empty matrix (0 rows, 0 columns), X * X' == X,
+      // so we simply return X.
+ return;
+ }
+    // Perform the actual LL^T Cholesky decomposition. This will only use
+    // the lower triangular part of 'input' by default; the upper triangular
+    // part of the matrix is not read.
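+    // For example (illustrative values): A = [[4, 12], [12, 37]] factors as
+    // L = [[2, 0], [6, 1]], since L * L^T reproduces A.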
+ Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> llt_decomposition(input);
+
+ // Output the lower triangular in a dense form.
+ *output = llt_decomposition.matrixL();
+
+ OP_REQUIRES(context, llt_decomposition.info() == Eigen::Success,
+ errors::InvalidArgument("LLT decomposition was not successful. "
+ "The input might not be valid."));
+ }
+};
+
+REGISTER_LINALG_OP("Cholesky", (CholeskyOp<float, false>), float);
+REGISTER_LINALG_OP("Cholesky", (CholeskyOp<double, false>), double);
+REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<float, true>), float);
+REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<double, true>), double);
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
new file mode 100644
index 0000000000..b68fcec515
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -0,0 +1,153 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
+template <typename Device, typename T>
+class ConcatOp : public OpKernel {
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit ConcatOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor* concat_dim_tensor;
+ OP_REQUIRES_OK(c, c->input("concat_dim", &concat_dim_tensor));
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsLegacyScalar(concat_dim_tensor->shape()),
+ errors::InvalidArgument(
+ "Concat dim tensor should be a scalar integer, but got shape ",
+ concat_dim_tensor->shape().DebugString()));
+ const int32 concat_dim = concat_dim_tensor->scalar<int32>()();
+ OpInputList values;
+ OP_REQUIRES_OK(c, c->input_list("values", &values));
+ const int N = values.size();
+ const int input_dims = values[0].dims();
+ const TensorShape& input_shape = values[0].shape();
+ OP_REQUIRES(
+ c, (0 <= concat_dim && concat_dim < input_dims) ||
+ (kAllowLegacyScalars && concat_dim == 0),
+ errors::InvalidArgument(
+ "ConcatOp : Expected concatenating dimensions in the range [", 0,
+ ", ", input_dims, "), but got ", concat_dim));
+
+ // Note that we reduce the concat of n-dimensional tensors into a two
+ // dimensional concat. Assuming the dimensions of any input/output
+ // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+ // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+ // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
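+    // For example (illustrative shapes only): concatenating two [2, 3, 5]
+    // tensors along dimension 1 is treated as concatenating two [2, 15]
+    // matrices (x = 2, y = 3 * 5) into a [2, 30] result, which is then
+    // viewed as [2, 6, 5].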
+ ConstMatrixVector inputs_flat;
+ inputs_flat.reserve(N);
+ int64 inputs_flat_dim0 = 1;
+ for (int d = 0; d < concat_dim; ++d) {
+ inputs_flat_dim0 *= input_shape.dim_size(d);
+ }
+ int output_concat_dim = 0;
+ const bool input_is_scalar = TensorShapeUtils::IsLegacyScalar(input_shape);
+ for (int i = 0; i < N; ++i) {
+ const auto in = values[i];
+ const bool in_is_scalar = TensorShapeUtils::IsLegacyScalar(in.shape());
+ OP_REQUIRES(
+ c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
+ errors::InvalidArgument(
+ "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+ input_shape.ShortDebugString(), " vs. shape[", i, "] = ",
+ in.shape().ShortDebugString()));
+ for (int j = 0; j < input_dims; ++j) {
+ if (j == concat_dim) {
+ continue;
+ }
+ OP_REQUIRES(
+ c, in.dim_size(j) == input_shape.dim_size(j),
+ errors::InvalidArgument(
+ "ConcatOp : Dimensions of inputs should match: shape[0] = ",
+ input_shape.ShortDebugString(), " vs. shape[", i, "] = ",
+ in.shape().ShortDebugString()));
+ }
+ if (in.NumElements() > 0) {
+ int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+ inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+ }
+ // TODO(irving): Remove check once !kAllowLegacyScalars
+ output_concat_dim += in.dims() > 0 ? in.dim_size(concat_dim) : 1;
+ }
+
+ TensorShape output_shape(input_shape);
+ // TODO(irving): Remove rank 0 case once !kAllowLegacyScalars
+ if (output_shape.dims() == 0) {
+ output_shape.AddDim(output_concat_dim);
+ } else {
+ output_shape.set_dim(concat_dim, output_concat_dim);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+ if (output->NumElements() > 0) {
+ int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+ auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+ if (std::is_same<Device, GPUDevice>::value) {
+ ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
+ } else {
+ ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+ }
+ }
+ }
+};
+
+#define REGISTER_CONCAT(type) \
+ REGISTER_KERNEL_BUILDER(Name("Concat") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ ConcatOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_CONCAT);
+REGISTER_CONCAT(quint8);
+REGISTER_CONCAT(qint8);
+REGISTER_CONCAT(qint32);
+REGISTER_CONCAT(bfloat16);
+
+#undef REGISTER_CONCAT
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Concat") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ ConcatOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Concat")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("concat_dim")
+ .HostMemory("values")
+ .HostMemory("output"),
+ ConcatOp<CPUDevice, int32>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.h b/tensorflow/core/kernels/concat_op.h
new file mode 100644
index 0000000000..664e55080d
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op.h
@@ -0,0 +1,27 @@
+#ifndef TENSORFLOW_KERNELS_CONCAT_OP_H_
+#define TENSORFLOW_KERNELS_CONCAT_OP_H_
+
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/device_base.h"
+
+namespace tensorflow {
+
+// Assumes all inputs are nonempty
+template <typename T>
+void ConcatCPU(DeviceBase* d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output);
+
+// Assumes all inputs are nonempty
+template <typename T>
+void ConcatGPU(const Eigen::GpuDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONCAT_OP_H_
diff --git a/tensorflow/core/kernels/concat_op_cpu.cc b/tensorflow/core/kernels/concat_op_cpu.cc
new file mode 100644
index 0000000000..679a53721c
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_cpu.cc
@@ -0,0 +1,122 @@
+#define EIGEN_USE_THREADS
+
+#include <string.h>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+template <typename T>
+static inline void Copy(T* dst, const T* src, int n) {
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ memcpy(dst, src, n * sizeof(T));
+ } else {
+ for (int k = 0; k < n; ++k) {
+ *dst++ = *src++;
+ }
+ }
+}
+
+template <typename T>
+void ConcatCPU(DeviceBase* d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output) {
+ int num_inputs = inputs.size();
+ std::vector<ptrdiff_t> sizes;
+ sizes.reserve(num_inputs);
+ int row_size = 0;
+ for (int j = 0; j < num_inputs; ++j) {
+ sizes.push_back(inputs[j]->dimension(1));
+ row_size += sizes.back();
+ }
+
+ auto worker_threads = d->tensorflow_cpu_worker_threads();
+ int num_threads = std::min<int>(std::min(4, worker_threads->num_threads),
+ output->size() / 4096);
+ // Single threaded mode.
+ if (num_threads == 0) {
+ T* out = &(*output)(0, 0);
+ std::vector<const T*> inp;
+ inp.reserve(num_inputs);
+ for (int j = 0; j < num_inputs; ++j) {
+ inp.push_back(&(*inputs[j])(0, 0));
+ }
+ const int dim0 = output->dimension(0);
+ for (int i = 0; i < dim0; ++i) {
+ for (int j = 0; j < num_inputs; ++j) {
+ auto size = sizes[j];
+ Copy(out, inp[j], size);
+ out += size;
+ inp[j] += size;
+ }
+ }
+ return;
+ }
+
+ // Sharded mode.
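+  // Each shard receives a half-open range [start, end) of flat output
+  // elements; the range may start or end in the middle of an output row, so
+  // a leading partial row is handled separately below.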
+ auto work = [&row_size, &sizes, &inputs, &output, &num_inputs](int64 start,
+ int64 end) {
+ int64 skipped_rows = start / row_size;
+ T* out = output->data() + skipped_rows * row_size;
+ T* out_start = output->data() + start;
+ T* out_end = output->data() + end;
+
+ // Handle partial row at start
+ if (out < out_start) {
+ for (int j = 0; j < num_inputs; ++j) {
+ ptrdiff_t size = sizes[j];
+ ptrdiff_t offset = out_start - out;
+ if (size <= offset) {
+ out += size;
+ continue;
+ }
+ const T* inp = &(*inputs[j])(skipped_rows, 0);
+ if (offset > 0) {
+ out += offset;
+ inp += offset;
+ size -= offset;
+ }
+ size = std::min(size, out_end - out);
+ if (size <= 0) break;
+ Copy(out, inp, size);
+ out += size;
+ }
+ ++skipped_rows;
+ }
+ if (out == out_end) return;
+ CHECK(out >= out_start);
+ CHECK(out < out_end);
+
+ // Copy remaining data.
+ std::vector<const T*> inp;
+ inp.reserve(num_inputs);
+ for (int j = 0; j < num_inputs; ++j) {
+ inp.push_back(&(*inputs[j])(skipped_rows, 0));
+ }
+ const int dim0 = output->dimension(0);
+ for (int i = skipped_rows; i < dim0; ++i) {
+ for (int j = 0; j < num_inputs; ++j) {
+ ptrdiff_t size = std::min(sizes[j], out_end - out);
+ Copy(out, inp[j], size);
+ out += size;
+ inp[j] += size;
+ if (out == out_end) return;
+ }
+ }
+ };
+ Shard(num_threads, worker_threads->workers, output->size(), 100, work);
+}
+
+#define REGISTER(T) \
+ template void ConcatCPU<T>( \
+ DeviceBase*, \
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+ typename TTypes<T, 2>::Matrix* output);
+TF_CALL_ALL_TYPES(REGISTER)
+REGISTER(quint8)
+REGISTER(qint8)
+REGISTER(qint32)
+REGISTER(bfloat16)
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_gpu.cu.cc b/tensorflow/core/kernels/concat_op_gpu.cu.cc
new file mode 100644
index 0000000000..d8ce6bd85d
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_gpu.cu.cc
@@ -0,0 +1,41 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include <memory>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+void ConcatGPU(const GPUDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output) {
+ Eigen::array<ptrdiff_t, 2> offset(0, 0);
+ for (int i = 0; i < inputs.size(); ++i) {
+ Eigen::array<ptrdiff_t, 2> size = inputs[i]->dimensions();
+ output->slice(offset, size).device(d) = *inputs[i];
+ offset[1] += size[1];
+ }
+}
+
+#define REGISTER_GPU(T) \
+ template void ConcatGPU<T>( \
+ const GPUDevice& d, \
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
+ inputs, \
+ typename TTypes<T, 2>::Matrix* output);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
new file mode 100644
index 0000000000..4ccc5b5b19
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -0,0 +1,240 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
+// in size, and concatenate them along 'concat_dimension'.
+template <typename T>
+static void ConcatHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 100;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ Tensor in0(dt, TensorShape({kDim1, dim2}));
+ in0.flat<T>().setRandom();
+ Tensor in1(dt, TensorShape({kDim1, dim2}));
+ in1.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(
+ NodeBuilder(g->NewName("n"), "Concat")
+ .Input(test::graph::Constant(g, concat_dim))
+ .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
+ .Attr("N", 2)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_ConcatDim0Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 0, dim2);
+}
+
+static void BM_ConcatDim1Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+static void BM_ConcatDim1int16(int iters, int dim2) {
+ ConcatHelper<int16>(iters, 1, dim2);
+}
+static void BM_ConcatDim1bfloat16(int iters, int dim2) {
+ ConcatHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+template <typename T>
+static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 40000;
+ const int kNumInputs = 64;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ std::vector<NodeBuilder::NodeOut> inputs;
+ inputs.reserve(kNumInputs);
+ for (int i = 0; i < kNumInputs; ++i) {
+ Tensor in(dt, TensorShape({kDim1, dim2}));
+ in.flat<T>().setRandom();
+ inputs.push_back(test::graph::Constant(g, in));
+ }
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
+ .Input(test::graph::Constant(g, concat_dim))
+ .Input(inputs)
+ .Attr("N", 64)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumInputs * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
+ ConcatManyHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
+
+static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 100;
+ std::vector<float> data1(kDim1 * dim2, 1.0f);
+ std::vector<float> data2(kDim1 * dim2, 2.0f);
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
+ testing::StartTiming();
+ while (--iters > 0) {
+ const int n0 = data1.size();
+ const int n1 = data2.size();
+ float* result = new float[n0 + n1];
+ memcpy(&result[0], &data1[0], n0 * sizeof(float));
+ memcpy(&result[n0], &data2[0], n1 * sizeof(float));
+ delete[] result;
+ }
+}
+
+static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 0, dim2);
+}
+static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 1, dim2);
+}
+
+BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> EigenMap;
+static void MemcpyManyAlternative1(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ while (iters-- > 0) {
+ std::vector<bfloat16*> inputs(kNumCopies);
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* result = new bfloat16[size];
+ for (int j = 0; j < kNumCopies; ++j) {
+ bfloat16* output = &result[j * dim2];
+ for (int i = 0; i < kDim1; ++i) {
+ if (i + 1 < kDim1) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2 * kNumCopies;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
+static void MemcpyManyAlternative2(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ std::vector<bfloat16*> inputs(kNumCopies);
+ while (--iters > 0) {
+ bfloat16* result = new bfloat16[size];
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* output = result;
+ for (int i = 0; i < kDim1; ++i) {
+ for (int j = 0; j < kNumCopies; ++j) {
+ if (j + 1 < kNumCopies) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
+BENCHMARK(MemcpyManyAlternative1)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+BENCHMARK(MemcpyManyAlternative2)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
new file mode 100644
index 0000000000..281bafd3df
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -0,0 +1,249 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/constant_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+ConstantOp::ConstantOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+ const TensorProto* proto = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+ OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
+ *proto, AllocatorAttributes(), &tensor_));
+ OP_REQUIRES(
+ ctx, ctx->output_type(0) == tensor_.dtype(),
+ errors::InvalidArgument("Type mismatch between value (",
+ DataTypeString(tensor_.dtype()), ") and dtype (",
+ DataTypeString(ctx->output_type(0)), ")"));
+}
+
+void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); }
+
+ConstantOp::~ConstantOp() {}
+
+REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
+
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(D, TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Const").Device(DEVICE_##D).TypeConstraint<TYPE>("dtype"), \
+ ConstantOp);
+REGISTER_KERNEL(GPU, float);
+REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, uint8);
+REGISTER_KERNEL(GPU, int8);
+REGISTER_KERNEL(GPU, int16);
+REGISTER_KERNEL(GPU, int64);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, bool);
+// Currently we do not support string constants on GPU
+#undef REGISTER_KERNEL
+#endif
+
+// HostConstantOp differs from ConstantOp in that its output is always
+// in host memory.
+class HostConstantOp : public OpKernel {
+ public:
+ explicit HostConstantOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+ const TensorProto* proto = nullptr;
+ AllocatorAttributes alloc_attr;
+ alloc_attr.set_on_host(true);
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+ OP_REQUIRES_OK(
+ ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
+ OP_REQUIRES(
+ ctx, ctx->output_type(0) == tensor_.dtype(),
+ errors::InvalidArgument(
+ "Type mismatch between value (", DataTypeString(tensor_.dtype()),
+ ") and dtype (", DataTypeString(ctx->output_type(0)), ")"));
+ }
+
+ void Compute(OpKernelContext* ctx) override { ctx->set_output(0, tensor_); }
+
+ bool IsExpensive() override { return false; }
+
+ ~HostConstantOp() override {}
+
+ private:
+ Tensor tensor_;
+ TF_DISALLOW_COPY_AND_ASSIGN(HostConstantOp);
+};
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+ .Device(DEVICE_GPU)
+ .HostMemory("output")
+ .TypeConstraint<int32>("dtype"),
+ HostConstantOp);
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// Partial specialization of FillFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct FillFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in) {
+ out.device(d) = out.constant(in());
+ }
+};
+
+// Partial specialization of SetZeroFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct SetZeroFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out) {
+ out.device(d) = out.constant(0);
+ }
+};
+
+#define DEFINE_SETZERO_CPU(T) template struct SetZeroFunctor<CPUDevice, T>
+DEFINE_SETZERO_CPU(float);
+DEFINE_SETZERO_CPU(double);
+DEFINE_SETZERO_CPU(int32);
+DEFINE_SETZERO_CPU(complex64);
+#undef DEFINE_SETZERO_CPU
+
+} // end namespace functor
+
+template <typename Device, typename T>
+class FillOp : public OpKernel {
+ public:
+ explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& Tdims = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(Tdims.shape()),
+ errors::InvalidArgument("dims must be a vector of int32."));
+ const Tensor& Tvalue = context->input(1);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(Tvalue.shape()),
+ errors::InvalidArgument("value must be a scalar."));
+ auto dims = Tdims.flat<int32>();
+ for (int i = 0; i < dims.size(); i++) {
+ OP_REQUIRES(context, dims(i) >= 0,
+ errors::InvalidArgument("dims[", i, "] = ", dims(i),
+ " must be nonnegative."));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 0, TensorShapeUtils::MakeShape(
+ reinterpret_cast<const int32*>(dims.data()), dims.size()),
+ &out));
+ functor::FillFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), out->flat<T>(),
+ Tvalue.scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(D, TYPE) \
+ REGISTER_KERNEL_BUILDER(Name("Fill") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<TYPE>("T") \
+ .HostMemory("dims"), \
+ FillOp<D##Device, TYPE>);
+
+#define REGISTER_CPU_KERNEL(TYPE) REGISTER_KERNEL(CPU, TYPE)
+TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL(GPU, float);
+REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, uint8);
+REGISTER_KERNEL(GPU, int8);
+REGISTER_KERNEL(GPU, int16);
+REGISTER_KERNEL(GPU, int64);
+// Currently we do not support filling strings and complex64 on GPU
+
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Fill")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("dims")
+ .HostMemory("value")
+ .HostMemory("output"),
+ FillOp<CPUDevice, int32>);
+
+template <typename Device, typename T>
+class ZerosLikeOp : public OpKernel {
+ public:
+ explicit ZerosLikeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &out));
+ Tensor zero(DataTypeToEnum<T>::value, {1});
+ zero.scalar<T>().setZero();
+ const Tensor& zero_cref = zero;
+ functor::FillFunctor<Device, T> functor;
+ functor(ctx->eigen_device<Device>(), out->flat<T>(), zero_cref.scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(type, dev) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ZerosLike").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
+ ZerosLikeOp<dev##Device, type>)
+
+#define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU)
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL(float, GPU);
+REGISTER_KERNEL(double, GPU);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_KERNEL
+
+class PlaceholderOp : public OpKernel {
+ public:
+ explicit PlaceholderOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &expected_shape_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (expected_shape_.dims() > 0) {
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument(
+ "You must feed a value for placeholder tensor '", name(),
+ "' with dtype ", DataTypeString(output_type(0)),
+ " and shape ", expected_shape_.DebugString()));
+ } else {
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument(
+ "You must feed a value for placeholder tensor '", name(),
+ "' with dtype ", DataTypeString(output_type(0))));
+ }
+ }
+
+ private:
+ TensorShape expected_shape_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE_CPU), PlaceholderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
new file mode 100644
index 0000000000..20a5c9c42f
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#define TENSORFLOW_KERNELS_CONSTANT_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// ConstantOp returns the tensor specified by its "value" attribute.
+class ConstantOp : public OpKernel {
+ public:
+ explicit ConstantOp(OpKernelConstruction* ctx);
+ void Compute(OpKernelContext* ctx) override;
+ bool IsExpensive() override { return false; }
+ ~ConstantOp() override;
+
+ private:
+ Tensor tensor_;
+ TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
new file mode 100644
index 0000000000..64502378bd
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -0,0 +1,89 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct scalar_const_op {
+ typedef typename packet_traits<T>::type Packet;
+
+ const T* val;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ scalar_const_op(const scalar_const_op& x)
+ : val(x.val) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
+
+ template <typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(Index,
+ Index = 0) const {
+ return *val;
+ }
+
+ template <typename Index>
+ EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const {
+ return internal::pset1<Packet>(*val);
+ }
+};
+
+template <typename T>
+struct functor_traits<scalar_const_op<T> > {
+ enum {
+ Cost = 1,
+ PacketAccess = packet_traits<T>::Vectorizable,
+ IsRepeatable = true
+ };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+namespace tensorflow {
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization of FillFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct FillFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in) {
+ Eigen::internal::scalar_const_op<T> f(in.data());
+ out.device(d) = out.nullaryExpr(f);
+ }
+};
+
+#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>
+DEFINE_FILL_GPU(float);
+DEFINE_FILL_GPU(double);
+DEFINE_FILL_GPU(int32);
+DEFINE_FILL_GPU(uint8);
+DEFINE_FILL_GPU(int16);
+DEFINE_FILL_GPU(int8);
+DEFINE_FILL_GPU(int64);
+#undef DEFINE_FILL_GPU
+
+// Partial specialization of SetZeroFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct SetZeroFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
+ out.device(d) = out.constant(0);
+ }
+};
+
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>
+DEFINE_SETZERO_GPU(float);
+#undef DEFINE_SETZERO_GPU
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
new file mode 100644
index 0000000000..f5a464c07c
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -0,0 +1,43 @@
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Returns graph containing "num" const nodes. If 'sequential' is
+// true, make sure all constants are executed sequentially in the
+// graph by adding control dependencies.
+static Graph* ManyConsts(int num, bool sequential) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Node* prev = nullptr;
+ for (int i = 0; i < num; ++i) {
+ Tensor c(DT_FLOAT, TensorShape({}));
+ c.scalar<float>()() = i;
+ Node* curr = test::graph::Constant(g, c);
+ if (sequential && prev != nullptr) {
+ g->AddControlEdge(prev, curr);
+ }
+ prev = curr;
+ }
+ return g;
+}
+
+static void BM_ManyConsts_Parallel(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters);
+}
+BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10);
+
+static void BM_ManyConsts_Sequential(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters);
+}
+BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
new file mode 100644
index 0000000000..bc44a7f7cc
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -0,0 +1,359 @@
+#include "tensorflow/core/kernels/control_flow_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// A switch op has two inputs and two outputs. It forwards the value of
+// input:0 to the output port selected by input:1, which must be a boolean
+// scalar. Input:0 is forwarded to output:0 if input:1 is false, and to
+// output:1 otherwise.
+class SwitchOp : public OpKernel {
+ public:
+ explicit SwitchOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& outputPorts = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(outputPorts.shape()),
+ errors::InvalidArgument("The second input must be a scalar, "
+ "but it has shape ",
+ outputPorts.shape().ShortDebugString()));
+
+ bool pred = outputPorts.scalar<bool>()();
+ int port = (pred) ? 1 : 0;
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, port);
+ } else {
+ context->set_output(port, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~SwitchOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(SwitchOp);
+};
+
+#define REGISTER_CPU_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("Switch") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_CPU_REF_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_GPU_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("Switch") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_GPU_REF_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
+TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SWITCH);
+REGISTER_GPU_SWITCH(bool);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_REF_SWITCH);
+REGISTER_GPU_REF_SWITCH(int32);
+REGISTER_GPU_REF_SWITCH(bool);
+
+#undef REGISTER_CPU_SWITCH
+#undef REGISTER_CPU_REF_SWITCH
+#undef REGISTER_GPU_SWITCH
+#undef REGISTER_GPU_REF_SWITCH
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Switch")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("pred")
+ .HostMemory("output_false")
+ .HostMemory("output_true")
+ .TypeConstraint<int32>("T"),
+ SwitchOp);
+
+class RefSelectOp : public OpKernel {
+ public:
+ explicit RefSelectOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("N", &num_ref_inputs_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& index_tensor = context->input(0);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(index_tensor.shape()),
+ errors::InvalidArgument("Index must be a scalar, "
+ "but it has shape ",
+ index_tensor.shape().ShortDebugString()));
+
+ int32 index = index_tensor.scalar<int32>()();
+
+ OP_REQUIRES(context, index >= 0 && index < num_ref_inputs_,
+ errors::InvalidArgument("Index must be in the range [0, ",
+ num_ref_inputs_, ") but got ", index));
+ context->forward_ref_input_to_ref_output(index + 1, 0);
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~RefSelectOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RefSelectOp);
+
+ private:
+ int num_ref_inputs_;
+};
+
+#define REGISTER_CPU_REF_SELECT(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSelect") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("index") \
+ .TypeConstraint<type>("T"), \
+ RefSelectOp)
+TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SELECT);
+
+#undef REGISTER_CPU_REF_SELECT
+
+// A merge op has n inputs and two outputs. It forwards the value of the
+// first input that becomes available to its first output, and the
+// index of the first input to its second output.
+class MergeOp : public OpKernel {
+ public:
+ explicit MergeOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = context->input_type(0);
+ const int num_in = context->num_inputs();
+ OP_REQUIRES_OK(context, context->MatchSignature(DataTypeVector(num_in, dt),
+ {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ bool input_seen = false;
+ for (int i = 0; i < context->num_inputs(); ++i) {
+ if (context->has_input(i)) {
+ if (input_seen) {
+ context->SetStatus(errors::Internal(
+ "Merge can not have more than one valid input."));
+ return;
+ }
+ input_seen = true;
+
+ context->set_output(0, context->input(i));
+ Tensor* value_index = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}),
+ &value_index));
+ value_index->scalar<int32>()() = i;
+ }
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~MergeOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(MergeOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Merge").Device(DEVICE_CPU), MergeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Merge") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("value_index"), \
+ MergeOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Merge")
+ .Device(DEVICE_GPU)
+ .HostMemory("inputs")
+ .HostMemory("output")
+ .HostMemory("value_index")
+ .TypeConstraint<int32>("T"),
+ MergeOp);
+
+// An enter op has one input and one output. It creates or finds
+// the child frame that is uniquely identified by the frame_name,
+// and makes its input available to the child frame.
+class EnterOp : public OpKernel {
+ public:
+ explicit EnterOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ context->set_output(0, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~EnterOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EnterOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE_CPU), EnterOp);
+REGISTER_KERNEL_BUILDER(Name("RefEnter").Device(DEVICE_CPU), EnterOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Enter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp);
+#define REGISTER_GPU_REF_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RefEnter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_NUMBER_TYPES(REGISTER_GPU_REF_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+#undef REGISTER_GPU_REF_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Enter")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ EnterOp);
+
+// An exit op has one input and one output. It exits the current
+// frame to its parent frame, and makes its input available to the
+// parent frame.
+class ExitOp : public OpKernel {
+ public:
+ explicit ExitOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~ExitOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(ExitOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE_CPU), ExitOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Exit").Device(DEVICE_GPU).TypeConstraint<type>("T"), ExitOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Exit")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ExitOp);
+
+// A next_iteration op has one input and one output. It makes its input
+// available to the next iteration.
+class NextIterationOp : public OpKernel {
+ public:
+ explicit NextIterationOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~NextIterationOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE_CPU),
+ NextIterationOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("NextIteration").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ NextIterationOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("NextIteration")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ NextIterationOp);
+
+// A LoopCond op has one input and one output. The input is a boolean
+// scalar representing the taken branches of the "pivot" Switch that
+// determines loop termination. As a contract, any high-level front-end
+// should always use port '0' of the "pivot" switches for loop exit.
+class LoopCondOp : public OpKernel {
+ public:
+ explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~LoopCondOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp);
+REGISTER_KERNEL_BUILDER(Name("LoopCond")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output"),
+ LoopCondOp);
+
+// ControlTrigger kernels
+REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
+ ControlTriggerOp);
+
+REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
+ ControlTriggerOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
new file mode 100644
index 0000000000..184cc9fb63
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -0,0 +1,22 @@
+#ifndef TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+#define TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+// A ControlTriggerOp is similar to a NoOp. However, it always treats the input
+// control edges as Live edges. Its primary use so far is in the scheduling of
+// recvs, where we add ControlTrigger nodes and use them to trigger recvs. We
+// allow ControlTrigger nodes to be enabled by dead nodes.
+class ControlTriggerOp : public OpKernel {
+ public:
+ explicit ControlTriggerOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+ void Compute(OpKernelContext* context) override {}
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
new file mode 100644
index 0000000000..52bc11abf0
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -0,0 +1,71 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+// Tests for the switch op
+class SwitchOpTest : public OpsTestBase {
+ protected:
+ void Initialize(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("op", "Switch")
+ .Input(FakeInput(dt))
+ .Input(FakeInput())
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SwitchOpTest, Int32Success_6_s0) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {false});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+ EXPECT_EQ(nullptr, GetOutput(1));
+}
+
+TEST_F(SwitchOpTest, Int32Success_6_s1) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(1));
+ EXPECT_EQ(nullptr, GetOutput(0));
+}
+
+TEST_F(SwitchOpTest, Int32Success_2_3_s0) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {false});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+ EXPECT_EQ(nullptr, GetOutput(1));
+}
+
+TEST_F(SwitchOpTest, StringSuccess_s1) {
+ Initialize(DT_STRING);
+ AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+ test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(1));
+ EXPECT_EQ(nullptr, GetOutput(0));
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
new file mode 100644
index 0000000000..2fb623244c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -0,0 +1,127 @@
+#ifndef TENSORFLOW_KERNELS_CONV_2D_H_
+#define TENSORFLOW_KERNELS_CONV_2D_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// TODO(yangke): revisit these operations and in particular, see if we can
+// combine all of them into just one operation without causing nvcc to
+// timeout.
+template <typename Device, typename T, int Dims>
+struct ShuffleAndReverse {
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
+ const Eigen::array<bool, Dims>& reverse_dims,
+ typename TTypes<T, Dims>::Tensor output) {
+ output.device(d) = input.shuffle(order).reverse(reverse_dims);
+ }
+};
+
+template <typename Device, typename T, int Dims>
+struct InflatePadAndShuffle {
+ void operator()(
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& strides,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, Dims>& pad_dims,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
+ typename TTypes<T, Dims>::Tensor output) {
+ output.device(d) = input.inflate(strides).pad(pad_dims).shuffle(order);
+ }
+};
+
+template <typename Device, typename Input, typename Filter, typename Output>
+void SpatialConvolutionFunc(const Device& d, Output output, Input input,
+ Filter filter, int stride,
+ const Eigen::PaddingType& padding) {
+ output.device(d) = Eigen::SpatialConvolution(input, filter, stride, padding);
+}
+
+template <typename Device, typename T>
+struct SpatialConvolution {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor filter, int stride,
+ const Eigen::PaddingType& padding) {
+ SpatialConvolutionFunc(d, output, input, filter, stride, padding);
+ }
+};
+
+template <typename Device, typename T>
+struct SpatialConvolutionBackwardInput {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
+ typename TTypes<T, 4>::ConstTensor kernel,
+ typename TTypes<T, 4>::ConstTensor output_backward,
+ int input_rows, int input_cols, int stride) {
+ input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
+ kernel, output_backward, input_rows, input_cols, stride);
+ }
+};
+
+template <typename Device, typename T>
+struct SpatialConvolutionBackwardKernel {
+ void operator()(const Device& d,
+ typename TTypes<T, 4>::Tensor kernel_backward,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor output_backward,
+ int kernel_rows, int kernel_cols, int stride) {
+ kernel_backward.device(d) = Eigen::SpatialConvolutionBackwardKernel(
+ input, output_backward, kernel_rows, kernel_cols, stride);
+ }
+};
+
+// TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
+// My initial attempt to do this compiled but failed in the pytest
+// due to a swigdeps error.
+template <typename Device, typename T>
+struct MatMulConvFunctor {
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(
+ const Device& d, typename TTypes<T, 2>::Tensor out,
+ typename TTypes<T, 2>::ConstTensor in0,
+ typename TTypes<T, 2>::ConstTensor in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ out.device(d) = in0.contract(in1, dim_pair);
+ }
+};
+
+template <typename Device, typename T>
+struct TransformFilter {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ typename TTypes<T, 4>::Tensor out) {
+ out.device(d) = in.shuffle(Eigen::DSizes<Eigen::DenseIndex, 4>(3, 2, 0, 1));
+ }
+};
+
+template <typename Device, typename T>
+struct TransformDepth {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,
+ typename TTypes<T, 4>::Tensor out) {
+ out.device(d) = in.shuffle(shuffle);
+ }
+};
+
+template <typename Device, typename T>
+struct PadInput {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ int padding_rows_left, int padding_rows_right,
+ int padding_cols_left, int padding_cols_right,
+ typename TTypes<T, 4>::Tensor out) {
+ Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 4> padding;
+ padding[0] = std::make_pair(0, 0);
+ padding[1] = std::make_pair(padding_rows_left, padding_rows_right);
+ padding[2] = std::make_pair(padding_cols_left, padding_cols_right);
+ padding[3] = std::make_pair(0, 0);
+ out.device(d) = in.pad(padding);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONV_2D_H_
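A minimal sketch of the filter-layout permutation used above: TransformFilter shuffles a
filter from (rows, cols, in_depth, out_depth) to (out_depth, in_depth, rows, cols) via
shuffle(3, 2, 0, 1), and TransformDepth applies a caller-supplied permutation in the same
way. The standalone check below assumes only Eigen's unsupported Tensor module (inside the
TensorFlow tree the include path is third_party/eigen3/unsupported/Eigen/CXX11/Tensor);
the tensor sizes are arbitrary illustration values, not anything from the diff.

// Minimal sketch of Eigen's shuffle semantics as used by TransformFilter:
// output dimension d takes its extent from input dimension order[d].
#include <cassert>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 4, Eigen::RowMajor> filter(5, 7, 3, 16);  // R, C, ID, OD
  filter.setRandom();
  const Eigen::array<int, 4> order{{3, 2, 0, 1}};
  Eigen::Tensor<float, 4, Eigen::RowMajor> out = filter.shuffle(order);
  assert(out.dimension(0) == 16);  // out_depth
  assert(out.dimension(1) == 3);   // in_depth
  assert(out.dimension(2) == 5);   // rows
  assert(out.dimension(3) == 7);   // cols
  // Element-wise, out(o, i, r, c) == filter(r, c, i, o).
  assert(out(11, 2, 4, 6) == filter(4, 6, 2, 11));
  return 0;
}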
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
new file mode 100644
index 0000000000..bb21d7003c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -0,0 +1,1190 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// The operation to compute Conv2D gradients.
+//
+// To compute the gradients for Conv2D we need three input tensors: the input,
+// the filter, and the backprop of the output. From these we compute two
+// backprops: one with respect to the input and one with respect to the
+// filter. They are computed in two separate kernels.
+
+// Both backprops can be computed as straightforward conv2d.
+//
+// Consider a case where the input is 3x3 and the filter is 2x1:
+//
+// INPUT = [ A B C ]
+// [ D E F ]
+// [ G H I ]
+//
+// where each "A", "B", etc is batch x in_depth
+//
+// FILTER = [ X Y ]
+//
+// where both "X" and "Y" are in_depth x out_depth
+//
+// With VALID padding, the output is 3x2:
+//
+// OUTPUT = [ a b ]
+// [ c d ]
+// [ e f ]
+//
+// where each "a", "b", etc is batch x out_depth
+//
+// So we have:
+//
+// a = A * X + B * Y
+// b = B * X + C * Y
+// c = D * X + E * Y
+// d = E * X + F * Y
+// e = G * X + H * Y
+// f = H * X + I * Y
+//
+// So when we have backprops for the outputs (we denote them by
+// a', b', ... ):
+//
+// The backprops for the input are:
+//
+// A' = a' * X^t
+// B' = a' * Y^t + b' * X^t
+// C' = b' * Y^t
+// ...
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ 0 a' b' 0 ]
+// [ 0 c' d' 0 ]
+// [ 0 e' f' 0 ]
+// and
+//
+// FILTER = [ Y^t X^t ]
+//
+// The backprops for the filter are:
+//
+// X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f'
+// Y' = B^t * a' + C^t * b' + E^t * c' + F^t * d' + H^t * e' + I^t * f'
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ A^t B^t C^t ]
+// [ D^t E^t F^t ]
+// [ G^t H^t I^t ]
+//
+// and
+//
+// FILTER = [ a' b' ]
+// [ c' d' ]
+// [ e' f' ]
+//
+//
+//////////////////////////////////////////////////////////
+//
+// With a stride greater than one, it's a bit more complicated (we will need
+// to insert holes into the backprop).
+//
+// Consider the case where
+//
+// INPUT = [ A B C D E ]
+// [ F G H I J ]
+// [ K L M N O ]
+// and
+//
+// FILTER = [ X Y Z ]
+//
+// with stride 2.
+//
+// The output will be
+//
+// OUTPUT = [ a b ]
+// [ c d ]
+//
+// where:
+//
+// a = A * X + B * Y + C * Z
+// b = C * X + D * Y + E * Z
+// c = K * X + L * Y + M * Z
+// d = M * X + N * Y + O * Z
+//
+//
+// To compute the backprop for INPUT, we need to convolve
+//
+// INPUT = [ 0 0 a' 0 b' 0 0 ]
+// [ 0 0 0 0 0 0 0 ]
+// [ 0 0 c' 0 d' 0 0 ]
+//
+// (notice the holes in INPUT)
+//
+// and
+//
+// FILTER = [ Z^t Y^t X^t ]
+//
+// with stride 1.
+//
+// To compute the backprop for FILTER, we need to convolve
+//
+// INPUT = [ A^t B^t C^t D^t E^t ]
+// [ F^t G^t H^t I^t J^t ]
+// [ K^t L^t M^t N^t O^t ]
+// and
+//
+// FILTER = [ a' 0 b' ]
+// [ 0 0 0 ]
+// [ c' 0 d' ]
+//
+// (notice the holes in FILTER)
+//
+//
+// with stride 1
+//
+//////////////////////////////////////////////////////////
+//
+//
+// The case for SAME padding is in fact very similar to VALID -- we just
+// need to pad the input tensor a bit when computing the filter_backprop.
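A minimal standalone check of the relations above, restricted to a single spatial row with
batch = in_depth = out_depth = 1 (so every transpose is trivial), using the VALID, stride-1
example of input [ A B C ] and filter [ X Y ]; the numeric values are arbitrary and chosen
only to make the check non-trivial.

// Minimal sketch: verify that the input backprop equals a VALID conv of the
// zero-padded output backprop with the reversed filter, and that the filter
// backprop equals a VALID conv of the input with the output backprop.
#include <array>
#include <cassert>

int main() {
  const std::array<float, 3> in = {1.f, 2.f, 3.f};    // [ A B C ]
  const std::array<float, 2> filt = {10.f, 100.f};    // [ X Y ]
  const std::array<float, 2> grad_out = {2.f, 5.f};   // [ a' b' ]

  // Direct gradients of a = A*X + B*Y and b = B*X + C*Y.
  const std::array<float, 3> din = {
      grad_out[0] * filt[0],                          // A' = a'*X
      grad_out[0] * filt[1] + grad_out[1] * filt[0],  // B' = a'*Y + b'*X
      grad_out[1] * filt[1]};                         // C' = b'*Y
  const std::array<float, 2> dfilt = {
      in[0] * grad_out[0] + in[1] * grad_out[1],      // X' = A*a' + B*b'
      in[1] * grad_out[0] + in[2] * grad_out[1]};     // Y' = B*a' + C*b'

  // Input backprop as a conv of [ 0 a' b' 0 ] with the reversed filter [ Y X ].
  const std::array<float, 4> padded = {0.f, grad_out[0], grad_out[1], 0.f};
  const std::array<float, 2> rev = {filt[1], filt[0]};
  for (int i = 0; i < 3; ++i) {
    assert(padded[i] * rev[0] + padded[i + 1] * rev[1] == din[i]);
  }
  // Filter backprop as a conv of the input with [ a' b' ].
  for (int i = 0; i < 2; ++i) {
    assert(in[i] * grad_out[0] + in[i + 1] * grad_out[1] == dfilt[i]);
  }
  return 0;
}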
+
+// Common code between the two kernels: verifies that the dimensions all match
+// and extracts the padded rows and columns.
+#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \
+ const Tensor& out_backprop = context->input(2); \
+ OP_REQUIRES( \
+ context, input_shape.dims() == 4, \
+ errors::InvalidArgument(label, ": input must be 4-dimensional")); \
+ OP_REQUIRES( \
+ context, filter_shape.dims() == 4, \
+ errors::InvalidArgument(label, ": filter must be 4-dimensional")); \
+ OP_REQUIRES( \
+ context, out_backprop.dims() == 4, \
+ errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
+ const int64 batch = input_shape.dim_size(0); \
+ OP_REQUIRES( \
+ context, batch == out_backprop.dim_size(0), \
+ errors::InvalidArgument( \
+ label, ": input and out_backprop must have the same batch size")); \
+ const int64 input_rows = input_shape.dim_size(1); \
+ const int64 input_cols = input_shape.dim_size(2); \
+ const int64 filter_rows = filter_shape.dim_size(0); \
+ const int64 filter_cols = filter_shape.dim_size(1); \
+ const int64 output_rows = out_backprop.dim_size(1); \
+ const int64 output_cols = out_backprop.dim_size(2); \
+ const int64 in_depth = input_shape.dim_size(3); \
+ OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
+ errors::InvalidArgument( \
+ label, ": input and filter must have the same depth")); \
+ const int64 out_depth = filter_shape.dim_size(3); \
+ OP_REQUIRES( \
+ context, out_depth == out_backprop.dim_size(3), \
+ errors::InvalidArgument( \
+ label, ": filter and out_backprop must have the same out_depth")); \
+ const auto stride = strides_[1]; \
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \
+ if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) { \
+ out_rows = input_rows; \
+ out_cols = input_cols; \
+ } else { \
+ OP_REQUIRES_OK( \
+ context, Get2dOutputSize(input_rows, input_cols, filter_rows, \
+ filter_cols, stride, stride, padding_, \
+ &out_rows, &out_cols, &pad_rows, &pad_cols)); \
+ } \
+ OP_REQUIRES( \
+ context, output_rows == out_rows, \
+ errors::InvalidArgument( \
+ label, ": Number of rows of out_backprop doesn't match computed: ", \
+ "actual = ", output_rows, ", computed = ", out_rows)); \
+ OP_REQUIRES( \
+ context, output_cols == out_cols, \
+ errors::InvalidArgument( \
+ label, ": Number of cols of out_backprop doesn't match computed: ", \
+ "actual = ", output_cols, ", computed = ", out_cols)); \
+ const auto expanded_out_rows = (output_rows - 1) * stride + 1; \
+ const auto expanded_out_cols = (output_cols - 1) * stride + 1; \
+ const auto padded_out_rows = input_rows + filter_rows - 1; \
+ const auto padded_out_cols = input_cols + filter_cols - 1; \
+ const auto top_pad_rows = filter_rows - 1 - pad_rows; \
+ const auto left_pad_cols = filter_cols - 1 - pad_cols; \
+ const auto bottom_pad_rows = \
+ padded_out_rows - expanded_out_rows - top_pad_rows; \
+ const auto right_pad_cols = \
+ padded_out_cols - expanded_out_cols - left_pad_cols; \
+ Eigen::DSizes<Eigen::DenseIndex, 4> strides{1, stride, stride, 1}; \
+ VLOG(2) << "Conv2d: " << label \
+ << ": expanded_out_rows = " << expanded_out_rows \
+ << ", expanded_out_cols = " << expanded_out_cols \
+ << ", filter_rows = " << filter_rows \
+ << ", filter_cols = " << filter_cols \
+ << ", padded_out_rows = " << padded_out_rows \
+ << ", padded_out_cols = " << padded_out_cols \
+ << ", top_pad_rows = " << top_pad_rows \
+ << ", left_pad_cols = " << left_pad_cols \
+ << ", bottom_pad_rows = " << bottom_pad_rows \
+ << ", right_pad_cols = " << right_pad_cols \
+ << ", strides = " << strides[1]
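As a worked instance of this bookkeeping, take the stride-2 example drawn above and assume
VALID padding (so Get2dOutputSize returns pad_cols = 0). The column-direction quantities
come out as:

  input_cols = 5, filter_cols = 3, stride = 2
  out_cols          = ceil((5 - 3 + 1) / 2)           = 2
  expanded_out_cols = (2 - 1) * 2 + 1                 = 3
  padded_out_cols   = 5 + 3 - 1                       = 7
  left_pad_cols     = 3 - 1 - 0                       = 2
  right_pad_cols    = 7 - 3 - 2                       = 2

which is exactly the seven-entry row [ 0 0 a' 0 b' 0 0 ] in the input-backprop diagram: the
output backprop gets stride - 1 zeros inserted between entries (the expansion) and two zeros
of padding on each side.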
+
+namespace {
+TensorShape VectorToShape(const TTypes<int32>::ConstVec& sizes) {
+ TensorShape shape;
+
+ using Index = TTypes<int32>::ConstVec::Index;
+ const Index dims = sizes.size();
+ for (Index i = 0; i < dims; ++i) {
+ shape.AddDim(sizes(i));
+ }
+
+ return shape;
+}
+} // namespace
+
+// The fast versions using eigen computations directly. They are only enabled
+// for CPU for now since nvcc times out when trying to compile them.
+// TODO(yangke): enable them for GPUs when we have a faster compiler.
+
+template <typename Device, class T>
+class Conv2DFastBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+ // Need to flip the input_rows and input_cols when passing to eigen.
+ functor::SpatialConvolutionBackwardInput<Device, T>()(
+ context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
+ filter.tensor<T, 4>(), out_backprop.tensor<T, 4>(), input_cols,
+ input_rows, stride);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp);
+};
+
+// Based on implementation written by Yangqing Jia (jiayq).
+template <typename Device, class T>
+class Conv2DCustomBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(
+ context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+
+ // TODO(andydavis) Consider moving code shared with
+ // Conv2DCustomBackpropFilterOp into a shared helper function.
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ OP_REQUIRES_OK(
+ context,
+ Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols,
+ stride, stride, padding_, &out_rows, &out_cols,
+ &pad_top, &pad_bottom, &pad_left, &pad_right));
+
+ // The total dimension size of each kernel.
+ const int filter_total_size = filter_rows * filter_cols * in_depth;
+ // The output image size is the spatial size of the output.
+ const int output_image_size = out_rows * out_cols;
+
+ Tensor col_buffer;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({output_image_size, filter_total_size}), &col_buffer));
+
+ // The input offset corresponding to a single input image.
+ const int input_offset = input_rows * input_cols * in_depth;
+ // The output offset corresponding to a single output image.
+ const int output_offset = out_rows * out_cols * out_depth;
+
+ auto* filter_data = filter.template flat<T>().data();
+ auto* col_buffer_data = col_buffer.template flat<T>().data();
+ auto* out_backprop_data = out_backprop.template flat<T>().data();
+ auto* input_backprop_data = in_backprop->template flat<T>().data();
+
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> MatrixMap;
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> ConstMatrixMap;
+
+ for (int image_id = 0; image_id < batch; ++image_id) {
+ // Compute gradient into col_buffer.
+ MatrixMap C(col_buffer_data, output_image_size, filter_total_size);
+
+ ConstMatrixMap A(out_backprop_data + output_offset * image_id,
+ output_image_size, out_depth);
+ ConstMatrixMap B(filter_data, filter_total_size, out_depth);
+
+ // TODO(andydavis) Use a multi-threaded matmul implementation here.
+ C.noalias() = A * B.transpose();
+
+ Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows,
+ filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
+ stride, input_backprop_data);
+
+ input_backprop_data += input_offset;
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
+};
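A sketch of the shape bookkeeping in the gemm above: per image, A is the out_backprop slice
of shape [output_image_size, out_depth] and B is the filter of shape
[filter_total_size, out_depth], so C = A * B^T has shape
[output_image_size, filter_total_size], one row of unfolded patch gradients per output
position; Col2im then accumulates those rows back into the
[input_rows, input_cols, in_depth] input backprop. The filter-gradient kernel further down
runs the same decomposition the other way around: Im2col unfolds the input into
[output_image_size, filter_total_size] and accumulates A^T * B, of shape
[filter_total_size, out_depth], into the filter backprop.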
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropInputOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .Label("custom")
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropInputOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .Label("eigen_tensor")
+ .TypeConstraint<float>("T"),
+ Conv2DFastBackpropInputOp<CPUDevice, float>);
+
+template <typename Device, class T>
+class Conv2DFastBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
+ Tensor* filter_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ // Need to flip the filter_rows and filter_cols when passing to eigen.
+ functor::SpatialConvolutionBackwardKernel<Device, T>()(
+ context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
+ input.tensor<T, 4>(), out_backprop.tensor<T, 4>(), filter_cols,
+ filter_rows, stride);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp);
+};
+
+// Based on implementation written by Yangqing Jia (jiayq).
+template <typename Device, class T>
+class Conv2DCustomBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DCustomBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(
+ context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
+ "not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DCustomBackpropFilter");
+ Tensor* filter_backprop;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ OP_REQUIRES_OK(
+ context,
+ Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols,
+ stride, stride, padding_, &out_rows, &out_cols,
+ &pad_top, &pad_bottom, &pad_left, &pad_right));
+
+ // The total dimension size of each kernel.
+ const int filter_total_size = filter_rows * filter_cols * in_depth;
+ // The output image size is the spatial size of the output.
+ const int output_image_size = out_rows * out_cols;
+
+ Tensor col_buffer;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({output_image_size, filter_total_size}), &col_buffer));
+
+ // The input offset corresponding to a single input image.
+ const int input_offset = input_rows * input_cols * in_depth;
+ // The output offset corresponding to a single output image.
+ const int output_offset = out_rows * out_cols * out_depth;
+
+ auto* input_data = input.template flat<T>().data();
+ auto* col_buffer_data = col_buffer.template flat<T>().data();
+ auto* out_backprop_data = out_backprop.template flat<T>().data();
+ auto* filter_backprop_data = filter_backprop->template flat<T>().data();
+
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> MatrixMap;
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> ConstMatrixMap;
+
+ MatrixMap C(filter_backprop_data, filter_total_size, out_depth);
+
+ C.setZero();
+ for (int image_id = 0; image_id < batch; ++image_id) {
+ // When we compute the gradient with respect to the filters, we need to do
+ // im2col to allow gemm-type computation.
+ Im2col<T>(input_data, in_depth, input_rows, input_cols, filter_rows,
+ filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
+ stride, col_buffer_data);
+
+ ConstMatrixMap A(col_buffer_data, output_image_size, filter_total_size);
+ ConstMatrixMap B(out_backprop_data + output_offset * image_id,
+ output_image_size, out_depth);
+
+ // Compute gradient with respect to filter.
+ // TODO(andydavis) Use a multi-threaded matmul implementation here.
+ C.noalias() += A.transpose() * B;
+
+ input_data += input_offset;
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropFilterOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .Label("custom")
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropFilterOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .Label("eigen_tensor")
+ .TypeConstraint<float>("T"),
+ Conv2DFastBackpropFilterOp<CPUDevice, float>);
+
+// GPU definitions of both ops.
+#if GOOGLE_CUDA
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+// The slow version (but compiles for GPU)
+
+// Backprop for input.
+template <typename Device, class T>
+class Conv2DSlowBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DSlowBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+
+ const int padding_rows =
+ (output_rows - 1) * stride + filter_rows - input_rows;
+ const int padding_cols =
+ (output_cols - 1) * stride + filter_cols - input_cols;
+
+ // TODO(keveman): cuDNN only supports equal padding on both sides, so only
+ // calling it when that is true. Remove this check when (if?) cuDNN starts
+ // supporting different padding.
+ bool padding_compatible =
+ (padding_rows % 2 == 0) && (padding_cols % 2 == 0);
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn_ && padding_compatible) {
+ if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
+ // 1x1 filter, so call cublas directly.
+ const uint64 m = batch * input_rows * input_cols;
+ const uint64 k = out_depth;
+ const uint64 n = in_depth;
+
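+        // With a 1x1 filter and unit stride, the input gradient is just
+        // in_backprop = out_backprop * filter^T; the GEMM below computes this
+        // with out_backprop viewed as an [m, k] matrix.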
+ auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
+ out_backprop.template flat<T>().size());
+ auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
+ filter.template flat<T>().size());
+ auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
+ in_backprop->template flat<T>().size());
+
+ auto transpose = perftools::gputools::blas::Transpose::kTranspose;
+ auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+
+ bool blas_launch_status =
+ stream->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr,
+ k, a_ptr, k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
+ m, ", n=", n, ", k=", k));
+ }
+ return;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(batch)
+ .set_height(input_rows)
+ .set_width(input_cols)
+ .set_feature_map_count(in_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(batch)
+ .set_height(output_rows)
+ .set_width(output_cols)
+ .set_feature_map_count(out_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter_rows)
+ .set_input_filter_width(filter_cols)
+ .set_input_feature_map_count(in_depth)
+ .set_output_feature_map_count(out_depth);
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride)
+ .set_zero_padding_height(padding_rows / 2)
+ .set_zero_padding_width(padding_cols / 2);
+
+ // NOTE(keveman):
+ // cuDNN only supports the following layouts :
+ // Input : B x D x R x C
+ // Filter : OD x ID x R x C
+ // Whereas, we have
+ // Input : B x R x C x D
+ // Filter : R x C x ID x OD
+ // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C)
+ // The first TransformDepth performs
+ // (B x R x C x D) => (B x D x R x C).
+ // Since the tensor returned from cuDNN is B x D x R x C also,
+ // the second TransformDepth performs
+ // (B x D x R x C) => (B x R x C x D).
+ Tensor transformed_filter;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
+ &transformed_filter));
+
+ functor::TransformFilter<Device, T>()(context->eigen_device<Device>(),
+ filter.tensor<T, 4>(),
+ transformed_filter.tensor<T, 4>());
+
+ Tensor transformed_out_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, out_depth, output_rows, output_cols}),
+ &transformed_out_backprop));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_out_backprop.tensor<T, 4>());
+
+ Tensor pre_transformed_in_backprop;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, in_depth, input_rows, input_cols}),
+ &pre_transformed_in_backprop));
+
+ auto out_backprop_ptr =
+ AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
+ transformed_out_backprop.template flat<T>().size());
+ auto filter_ptr =
+ AsDeviceMemory(transformed_filter.template flat<T>().data(),
+ transformed_filter.template flat<T>().size());
+ auto in_backprop_ptr =
+ AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
+ pre_transformed_in_backprop.template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolveBackwardData(filter_desc, filter_ptr, output_desc,
+ out_backprop_ptr, conv_desc,
+ input_desc, &in_backprop_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ context->SetStatus(errors::Internal(
+ "cuDNN Backward Data function launch failure : input shape(",
+ input_shape.DebugString(), ") filter shape(",
+ filter_shape.DebugString(), ")"));
+ }
+
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1),
+ in_backprop->tensor<T, 4>());
+ } else {
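+      // Falling back to the non-cudnn path, the input gradient is computed
+      // as a VALID convolution of the inflated and zero-padded out_backprop
+      // with a filter whose spatial dimensions are reversed and whose
+      // in_depth and out_depth dimensions are swapped.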
+ // We fill out a padded out_backprop
+ TensorShape padded_out_shape(
+ {batch, padded_out_rows, padded_out_cols, out_depth});
+ Tensor padded_output;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ padded_out_shape, &padded_output));
+
+ Eigen::DSizes<Eigen::DenseIndex, 4> trivial_order{0, 1, 2, 3};
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
+ {{0, 0},
+ {top_pad_rows, bottom_pad_rows},
+ {left_pad_cols, right_pad_cols},
+ {0, 0}}};
+
+ functor::InflatePadAndShuffle<Device, T, 4>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
+ pad_dims, trivial_order, padded_output.tensor<T, 4>());
+ const Tensor& padded_output_cref = padded_output;
+
+      // We then need to fill a new "reverted" filter: we transpose the
+      // in_depth and out_depth dimensions of the filter and reverse its rows
+      // and cols.
+ TensorShape r_filter_shape(
+ {filter_rows, filter_cols, out_depth, in_depth});
+ Tensor r_filter;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ r_filter_shape, &r_filter));
+
+ Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2};
+ Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order,
+ filter_rev_dims, r_filter.tensor<T, 4>());
+ const Tensor& r_filter_cref = r_filter;
+
+ // Now we can call conv_2d directly.
+ functor::SpatialConvolution<Device, T>()(
+ context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
+ padded_output_cref.tensor<T, 4>(), r_filter_cref.tensor<T, 4>(), 1,
+ BrainPadding2EigenPadding(VALID));
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ bool use_cudnn_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp);
+};
+
+// Backprop for filter.
+template <typename Device, class T>
+class Conv2DSlowBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DSlowBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
+ Tensor* filter_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ const int padding_rows =
+ (output_rows - 1) * stride + filter_rows - input_rows;
+ const int padding_cols =
+ (output_cols - 1) * stride + filter_cols - input_cols;
+
+ // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
+ // calling it when that is true. Remove this check when (if?) cuDNN starts
+ // supporting different padding.
+ bool padding_compatible =
+ (padding_rows % 2 == 0) && (padding_cols % 2 == 0);
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn_ && padding_compatible) {
+ if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
+ const uint64 m = in_depth;
+ const uint64 k = batch * input_rows * input_cols;
+ const uint64 n = out_depth;
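+        // For a 1x1 filter with unit stride, the filter gradient reduces to
+        // filter_backprop = input^T * out_backprop, contracting over the
+        // batch and spatial positions (the k dimension below).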
+
+ // The shape of output backprop is
+ // [batch, out_rows, out_cols, out_depth]
+ // From cublas's perspective, it is: n x k
+ auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
+ out_backprop.template flat<T>().size());
+
+ // The shape of input is
+ // [batch, in_rows, in_cols, in_depth],
+ // From cublas's perspective, it is: m x k
+ auto b_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+
+        // The shape of the filter backprop from the conv_2d should be
+ // [1, 1, in_depth, out_depth]
+ // From cublas's perspective, it is: n x m
+ auto c_ptr = AsDeviceMemory(filter_backprop->template flat<T>().data(),
+ filter_backprop->template flat<T>().size());
+
+ bool blas_launch_status =
+ stream->ThenBlasGemm(
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose, n, m, k,
+ 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
+ m, ", n=", n, ", k=", k));
+ }
+ return;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(batch)
+ .set_height(input_rows)
+ .set_width(input_cols)
+ .set_feature_map_count(in_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(batch)
+ .set_height(output_rows)
+ .set_width(output_cols)
+ .set_feature_map_count(out_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter_rows)
+ .set_input_filter_width(filter_cols)
+ .set_input_feature_map_count(in_depth)
+ .set_output_feature_map_count(out_depth);
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride)
+ .set_zero_padding_height(padding_rows / 2)
+ .set_zero_padding_width(padding_cols / 2);
+
+ // NOTE(zhengxq):
+ // cuDNN only supports the following layouts :
+ // Input : B x D x R x C
+ // Filter : OD x ID x R x C
+ // Whereas, we have
+ // Input : B x R x C x D
+ // Filter : R x C x ID x OD
+ // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C)
+ // The first TransformDepth performs
+ // (B x R x C x D) => (B x D x R x C).
+ // Since the tensor returned from cuDNN is B x D x R x C also,
+ // the second TransformDepth performs
+ // (B x D x R x C) => (B x R x C x D).
+
+ Tensor pre_transformed_filter_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
+ &pre_transformed_filter_backprop));
+
+ Tensor transformed_out_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, out_depth, output_rows, output_cols}),
+ &transformed_out_backprop));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_out_backprop.tensor<T, 4>());
+
+ Tensor transformed_input;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, in_depth, input_rows, input_cols}),
+ &transformed_input));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_input.tensor<T, 4>());
+
+ auto out_backprop_ptr =
+ AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
+ transformed_out_backprop.template flat<T>().size());
+ auto filter_backprop_ptr = AsDeviceMemory(
+ pre_transformed_filter_backprop.template flat<T>().data(),
+ pre_transformed_filter_backprop.template flat<T>().size());
+ auto input_ptr =
+ AsDeviceMemory(transformed_input.template flat<T>().data(),
+ transformed_input.template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolveBackwardFilter(input_desc, input_ptr, output_desc,
+ out_backprop_ptr, conv_desc,
+ filter_desc, &filter_backprop_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ context->SetStatus(errors::Internal(
+ "cuDNN Backward Filter function launch failure : input shape(",
+ input_shape.DebugString(), ") filter shape(",
+ filter_shape.DebugString(), ")"));
+ }
+
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(pre_transformed_filter_backprop)
+ .template tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0),
+ filter_backprop->tensor<T, 4>());
+ } else {
+ // Fall back to the non-cudnn code path
+
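+      // The filter gradient of a convolution is itself a convolution between
+      // the input and the output gradient, with the batch dimension acting
+      // as the contracted (depth) dimension; both tensors are shuffled below
+      // so that batch lands in the depth position.
+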
+ // For the backprop of the filter, we need to also transpose the
+ // out_backprop.
+ // The shape of backprop is
+ // [batch, out_rows, out_cols, out_depth]
+ // And we need to change it to
+ // [out_depth, out_rows, out_cols, batch]
+ Eigen::DSizes<Eigen::DenseIndex, 4> out_order{3, 1, 2, 0};
+ TensorShape padded_out_shape(
+ {out_depth, padded_out_rows, padded_out_cols, batch});
+ Tensor padded_output;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ padded_out_shape, &padded_output));
+
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
+ {{0, 0},
+ {top_pad_rows, bottom_pad_rows},
+ {left_pad_cols, right_pad_cols},
+ {0, 0}}};
+ functor::InflatePadAndShuffle<Device, T, 4>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
+ pad_dims, out_order, padded_output.tensor<T, 4>());
+ const Tensor& padded_output_cref = padded_output;
+
+ // For the backprop of the filter, we need to transpose the input.
+ // The shape of input is
+ // [batch, in_rows, in_cols, in_depth]
+ // And we need to change it to
+ // [in_rows, in_cols, batch, in_depth]
+ Eigen::DSizes<Eigen::DenseIndex, 4> in_order{1, 2, 0, 3};
+ TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth});
+ Tensor in_shuffle;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ in_shuffle_shape, &in_shuffle));
+
+ // No need for reversing this time.
+ Eigen::array<bool, 4> trivial_dims{false, false, false, false};
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), in_order,
+ trivial_dims, in_shuffle.tensor<T, 4>());
+ const Tensor& in_shuffle_cref = in_shuffle;
+
+ // The output of the conv_2d would be
+ // [out_depth, filter_rows, filter_cols, in_depth]
+ // and we need to shuffle it back to
+ // [filter_rows, filter_cols, in_depth, out_depth];
+      // And we need to reverse the filter backprops.
+      // So we need to allocate (sigh) yet another piece of memory to hold
+      // the output.
+ TensorShape filter_shuffle_shape(
+ {out_depth, filter_rows, filter_cols, in_depth});
+ Tensor filter_shuffle;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
+ filter_shuffle_shape,
+ &filter_shuffle));
+
+ functor::SpatialConvolution<Device, T>()(
+ context->eigen_device<Device>(), filter_shuffle.tensor<T, 4>(),
+ padded_output_cref.tensor<T, 4>(), in_shuffle_cref.tensor<T, 4>(), 1,
+ BrainPadding2EigenPadding(VALID));
+
+ // Now copy the filter_backprop back to the destination.
+ Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0};
+ Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
+ const Tensor& filter_shuffle_cref = filter_shuffle;
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(),
+ filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>());
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ bool use_cudnn_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp);
+};
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ShuffleAndReverse<GPUDevice, T, 4>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+ const Eigen::array<bool, 4>& reverse_dims, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct ShuffleAndReverse<GPUDevice, T, 4>; \
+ template <> \
+ void InflatePadAndShuffle<GPUDevice, T, 4>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct InflatePadAndShuffle<GPUDevice, T, 4>; \
+ template <> \
+ void TransformFilter<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformFilter<GPUDevice, T>; \
+ template <> \
+ void TransformDepth<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformDepth<GPUDevice, T>; \
+ template <> \
+ void SpatialConvolution<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T, 4>::ConstTensor filter, int stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialConvolution<GPUDevice, T>; \
+ template <> \
+ void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \
+ typename TTypes<T, 4>::ConstTensor filter, \
+ typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \
+ int input_cols, int stride); \
+ extern template struct SpatialConvolutionBackwardInput<GPUDevice, T>
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("input_sizes"),
+ Conv2DSlowBackpropInputOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("filter_sizes"),
+ Conv2DSlowBackpropFilterOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
new file mode 100644
index 0000000000..aaa2951778
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -0,0 +1,373 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchGeneric {
+ static void launch(OpKernelContext* ctx, const Tensor& input,
+ const Tensor& filter, int stride,
+ const Eigen::PaddingType& padding, Tensor* output) {
+ if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 &&
+ stride == 1) {
+ // For 1x1 kernel, the 2D convolution is reduced to matrix
+ // multiplication.
+ //
+ // TODO(vrv): We should be able to call SpatialConvolution
+ // and it will produce the same result, but doing so
+ // led to NaNs during training. Using matmul instead for now.
+ int conv_width = 1; // Width for the convolution step.
+ for (int i = 0; i < 3; ++i) {
+ conv_width *= output->dim_size(i);
+ }
+
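+      // conv_width is batch * out_rows * out_cols, so the contraction below
+      // runs over in_depth and produces a [conv_width, out_depth] output.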
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ functor::MatMulConvFunctor<Device, T>()(
+ ctx->eigen_device<Device>(),
+ output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
+ input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
+ filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
+ dim_pair);
+ } else {
+ functor::SpatialConvolution<Device, T>()(
+ ctx->eigen_device<Device>(), output->tensor<T, 4>(),
+ input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding);
+ }
+ }
+};
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename T>
+struct LaunchConvOp<CPUDevice, T> {
+ static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input,
+ const Tensor& filter, int stride,
+ const Eigen::PaddingType& padding, Tensor* output) {
+ LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, stride, padding,
+ output);
+ }
+};
+
+template <typename Device, typename T>
+class Conv2DOp : public BinaryOp<T> {
+ public:
+ explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // Input tensor is of the following dimensions:
+ // [ batch, in_rows, in_cols, in_depth ]
+
+ const Tensor& input = context->input(0);
+
+ // Input filter is of the following dimensions:
+ // [ filter_rows, filter_cols, in_depth, out_depth]
+ const Tensor& filter = context->input(1);
+
+ // For 2D convolution, there should be 4 dimensions.
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, filter.dims() == 4,
+ errors::InvalidArgument("filter must be 4-dimensional: ",
+ filter.shape().ShortDebugString()));
+
+ // The last dimension for input is in_depth. It must be the same as the
+ // filter's in_depth.
+ const int64 in_depth = input.dim_size(3);
+ OP_REQUIRES(
+ context, in_depth == filter.dim_size(2),
+ errors::InvalidArgument("input and filter must have the same depth: ",
+ in_depth, " vs ", filter.dim_size(2)));
+
+ // The last dimension for filter is out_depth.
+ const int64 out_depth = filter.dim_size(3);
+
+ // The second dimension for input is rows/height.
+ // The first dimension for filter is rows/height.
+ const int64 input_rows = input.dim_size(1);
+ const int64 filter_rows = filter.dim_size(0);
+
+ // The third dimension for input is columns/width.
+ // The second dimension for filter is columns/width.
+ const int64 input_cols = input.dim_size(2);
+ const int64 filter_cols = filter.dim_size(1);
+
+ // The first dimension for input is batch.
+ const int64 batch = input.dim_size(0);
+
+ // For now we take the stride from the second dimension only (we
+ // assume row = col stride, and do not support striding on the
+ // batch or depth dimension).
+ const int stride = strides_[1];
+
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {
+ // For 1x1 kernel, the 2D convolution is reduced to matrix
+ // multiplication.
+ out_rows = input_rows;
+ out_cols = input_cols;
+ } else {
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(input_rows, input_cols, filter_rows,
+ filter_cols, stride, stride, padding_,
+ &out_rows, &out_cols, &pad_rows, &pad_cols));
+ }
+ TensorShape out_shape({batch, out_rows, out_cols, out_depth});
+
+ // Output tensor is of the following dimensions:
+ // [ in_batch, out_rows, out_cols, out_depth ]
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ VLOG(2) << "Conv2D: in_depth = " << in_depth
+ << ", input_cols = " << input_cols
+ << ", filter_cols = " << filter_cols
+ << ", input_rows = " << input_rows
+ << ", filter_rows = " << filter_rows << ", stride = " << stride
+ << ", out_depth = " << out_depth;
+
+ LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
+ BrainPadding2EigenPadding(padding_),
+ output);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ bool use_cudnn_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+template <typename T>
+struct LaunchConvOp<GPUDevice, T> {
+ static void launch(OpKernelContext* ctx, bool use_cudnn,
+ const Tensor& input_param, const Tensor& filter,
+ int stride, const Eigen::PaddingType& padding,
+ Tensor* output) {
+ auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn) {
+ Tensor input = input_param;
+ if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
+ // 1x1 filter, so call cublas directly.
+ const uint64 m =
+ input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
+ const uint64 k = filter.dim_size(2);
+ const uint64 n = filter.dim_size(3);
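+        // A 1x1 convolution with unit stride is a plain matrix product:
+        // output = input * filter, with the input viewed as an [m, k] matrix
+        // and the filter as a [k, n] matrix.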
+
+ auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+ auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
+ filter.template flat<T>().size());
+ auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
+ output->template flat<T>().size());
+
+ auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+ bool blas_launch_status =
+ stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
+ b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+ ", n=", n, ", k=", k));
+ }
+ return;
+ }
+ if (padding == Eigen::PADDING_SAME) {
+ const int64 out_rows = output->dim_size(1);
+ const int64 out_cols = output->dim_size(2);
+ const int64 in_rows = input.dim_size(1);
+ const int64 in_cols = input.dim_size(2);
+ const int64 patch_rows = filter.dim_size(0);
+ const int64 patch_cols = filter.dim_size(1);
+ // Total padding on rows and cols is
+ // Pr = (R' - 1) * S + Kr - R
+ // Pc = (C' - 1) * S + Kc - C
+ // where (R', C') are output dimensions, (R, C) are input dimensions, S
+ // is stride, (Kr, Kc) are filter dimensions.
+ // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+ // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
+ // we pad more on the right and bottom than on the top and left.
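+        // For example, with SAME padding, in_rows = 7, stride = 2 and a 3x3
+        // filter, out_rows = 4 and Pr = (4 - 1) * 2 + 3 - 7 = 2, so one row
+        // of zeros is added on each side.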
+ const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
+ const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
+ Tensor transformed_input;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape(
+ {input.dim_size(0), input.dim_size(1) + padding_rows,
+ input.dim_size(2) + padding_cols, input.dim_size(3)}),
+ &transformed_input));
+
+ functor::PadInput<GPUDevice, T>()(
+ ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
+ padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
+ padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
+ input = transformed_input;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(input.dim_size(0))
+ .set_height(input.dim_size(1))
+ .set_width(input.dim_size(2))
+ .set_feature_map_count(input.dim_size(3))
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(output->dim_size(0))
+ .set_height(output->dim_size(1))
+ .set_width(output->dim_size(2))
+ .set_feature_map_count(output->dim_size(3))
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter.dim_size(0))
+ .set_input_filter_width(filter.dim_size(1))
+ .set_input_feature_map_count(filter.dim_size(2))
+ .set_output_feature_map_count(filter.dim_size(3));
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride);
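+      // No zero padding is set on conv_desc: for SAME padding the input has
+      // already been padded explicitly above, and VALID needs none.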
+
+ Tensor transformed_filter;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({filter.dim_size(3), filter.dim_size(2),
+ filter.dim_size(0), filter.dim_size(1)}),
+ &transformed_filter));
+
+ functor::TransformFilter<GPUDevice, T>()(
+ ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
+ transformed_filter.tensor<T, 4>());
+
+ auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+ auto filter_ptr =
+ AsDeviceMemory(transformed_filter.template flat<T>().data(),
+ transformed_filter.template flat<T>().size());
+ auto output_ptr = AsDeviceMemory(output->template flat<T>().data(),
+ output->template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
+ conv_desc, output_desc, &output_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ ctx->SetStatus(errors::Internal(
+ "cuDNN launch failure : input shape(", input.shape().DebugString(),
+ ") filter shape(", filter.shape().DebugString(), ")"));
+ }
+ } else {
+ LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride,
+ padding, output);
+ }
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialConvolution<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T, 4>::ConstTensor filter, int stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialConvolution<GPUDevice, T>; \
+ template <> \
+ void MatMulConvFunctor<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 2>::Tensor out, \
+ typename TTypes<T, 2>::ConstTensor in0, \
+ typename TTypes<T, 2>::ConstTensor in1, \
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
+ extern template struct MatMulConvFunctor<GPUDevice, T>; \
+ template <> \
+ void TransformFilter<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformFilter<GPUDevice, T>; \
+ template <> \
+ void PadInput<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ int padding_rows_left, int padding_rows_right, int padding_cols_left, \
+ int padding_cols_right, typename TTypes<T, 4>::Tensor out); \
+ extern template struct PadInput<GPUDevice, T>
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ Conv2DOp<GPUDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_gpu.cu.cc b/tensorflow/core/kernels/conv_ops_gpu.cu.cc
new file mode 100644
index 0000000000..44af814e2b
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu.cu.cc
@@ -0,0 +1,35 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct SpatialConvolution<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor filter, int stride,
+ const Eigen::PaddingType& padding) {
+ // TODO(keveman): nvcc 6.5 crashes when 32 bit indexing is turned on. Enable
+ // this when we move to cuda 7.0.
+ // SpatialConvolutionFunc(d, To32Bit(output), To32Bit(input),
+ // To32Bit(filter), stride, padding);
+
+ SpatialConvolutionFunc(d, output, input, filter, stride, padding);
+ }
+};
+
+template struct SpatialConvolution<GPUDevice, float>;
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
new file mode 100644
index 0000000000..e2e9d25d83
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, float, 4>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
new file mode 100644
index 0000000000..dbbe08ef9c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::ShuffleAndReverse<GPUDevice, float, 4>;
+
+template struct functor::TransformFilter<GPUDevice, float>;
+
+template struct functor::PadInput<GPUDevice, float>;
+
+template struct functor::TransformDepth<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc
new file mode 100644
index 0000000000..87d79ecb4d
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::MatMulConvFunctor<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/core_ops_test.cc b/tensorflow/core/kernels/core_ops_test.cc
new file mode 100644
index 0000000000..a42a5999da
--- /dev/null
+++ b/tensorflow/core/kernels/core_ops_test.cc
@@ -0,0 +1,990 @@
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/port.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+static void SetConstOp(const string& name, std::initializer_list<int64> dims,
+ NodeDef* node) {
+ Tensor tensor(DT_FLOAT, TensorShape(dims));
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ tensor.flat<float>()(i) = i / 10.0f;
+ }
+ TF_CHECK_OK(NodeDefBuilder(name, "Const")
+ .Attr("dtype", DT_FLOAT)
+ .Attr("value", tensor)
+ .Finalize(node));
+}
+
+static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes,
+ NodeDef* node) {
+ TensorShape shape;
+ shape.AddDim(sizes.size());
+ Tensor tensor(DT_INT32, shape);
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ tensor.flat<int32>()(i) = sizes[i];
+ }
+ TF_CHECK_OK(NodeDefBuilder(name, "Const")
+ .Attr("dtype", DT_INT32)
+ .Attr("value", tensor)
+ .Finalize(node));
+}
+
+namespace {
+
+enum CONV_OP {
+ CONV_OP_FORWARD = 0,
+ CONV_OP_BACKPROP_INPUT = 1,
+ CONV_OP_BACKPROP_FILTER = 2
+};
+
+} // namespace
+
+static void BM_ConvFloat(int iters, int batch, int rows, int cols, int in_depth,
+ int out_depth, int filter_rows, int filter_cols,
+ CONV_OP op, int num_threads, int stride,
+ Padding padding, bool use_gpu, const string& label) {
+ if (!IsGoogleCudaEnabled() && use_gpu) {
+ testing::SetLabel(
+ strings::StrCat("Skipping GPU test (no --config=cuda): ", label));
+ return;
+ }
+ testing::SetLabel(label);
+
+ // Set the number of threads
+ SessionOptions options;
+ options.config.set_intra_op_parallelism_threads(num_threads);
+
+ // We set up a graph for computing convolution.
+ GraphDef graph;
+
+ // For this, we need an input tensor and a filter tensor.
+ // Compute the output size.
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ TF_CHECK_OK(Get2dOutputSize(rows, cols, filter_rows, filter_cols, stride,
+ stride, padding, &out_rows, &out_cols, &pad_rows,
+ &pad_cols));
+ // Counting the number of floating point operations (both MUL and ADD)
+ int64 num_ops = 0;
+ if (op == CONV_OP_FORWARD) {
+ // Forward computation:
+    // BATCH x OUT_ROW X OUT_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH
+    // We multiply by two since there are multiplications and additions.
+ num_ops = static_cast<int64>(batch * in_depth * out_depth) *
+ static_cast<int64>(filter_rows * filter_cols) *
+ static_cast<int64>(out_rows * out_cols) * 2;
+ } else {
+ // Backward computation: both input and filter backprop take the same
+ // amount of computation:
+ // BATCH x IN_ROW X IN_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH
+    // We multiply by two since there are multiplications and additions.
+ num_ops = static_cast<int64>(batch * in_depth * out_depth) *
+ static_cast<int64>(filter_rows * filter_cols) *
+ static_cast<int64>(rows * cols) * 2;
+ }
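+  // For example, the forward pass of conv0 below (batch 32, 5x5 spatial,
+  // 1248 input and 128 output channels, 1x1 filter, SAME padding) counts
+  // 32 * 1248 * 128 * 1 * 1 * 5 * 5 * 2, roughly 2.6e8 operations.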
+
+ SetConstOp("input", {batch, rows, cols, in_depth}, graph.add_node());
+ SetConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth},
+ graph.add_node());
+ SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth},
+ graph.add_node());
+ SetConstSizesOp("input_sizes",
+ std::vector<int32>({batch, rows, cols, in_depth}),
+ graph.add_node());
+ SetConstSizesOp("filter_sizes", std::vector<int32>({filter_rows, filter_cols,
+ in_depth, out_depth}),
+ graph.add_node());
+
+ // Now add the convolution op
+ NodeDef* conv = graph.add_node();
+ switch (op) {
+ case CONV_OP_FORWARD:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D")
+ .Input("input", 0, DT_FLOAT)
+ .Input("filter", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ case CONV_OP_BACKPROP_INPUT:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropInput")
+ .Input("input_sizes", 0, DT_INT32)
+ .Input("filter", 0, DT_FLOAT)
+ .Input("output_backprop", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ case CONV_OP_BACKPROP_FILTER:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropFilter")
+ .Input("input", 0, DT_FLOAT)
+ .Input("filter_sizes", 0, DT_INT32)
+ .Input("output_backprop", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ }
+ Graph* g = new Graph(OpRegistry::Global());
+ GraphConstructorOptions opts;
+ TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));
+
+ string device = use_gpu ? "gpu" : "cpu";
+ test::Benchmark(device, g, &options).Run(iters);
+ testing::ItemsProcessed(num_ops * iters);
+}
+
+// BS: batch_size
+// R: tensor_in_rows
+// C: tensor_in_cols
+// ID: input_depth
+// OD: output_depth
+// KR: kernel_rows
+// KC: kernel_cols
+#define BM_ConvFloatFwd(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \
+ static void BM_ConvFloatFwdCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \
+ PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatFwdCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 4, STR, \
+ PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatFwdGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \
+ PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ BENCHMARK(BM_ConvFloatFwdCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatFwdCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatFwdGPU_##LABEL)
+
+BM_ConvFloatFwd(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
+BM_ConvFloatFwd(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
+BM_ConvFloatFwd(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
+BM_ConvFloatFwd(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
+BM_ConvFloatFwd(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
+BM_ConvFloatFwd(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
+BM_ConvFloatFwd(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
+BM_ConvFloatFwd(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
+BM_ConvFloatFwd(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
+BM_ConvFloatFwd(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
+BM_ConvFloatFwd(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
+BM_ConvFloatFwd(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
+BM_ConvFloatFwd(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
+BM_ConvFloatFwd(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
+BM_ConvFloatFwd(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
+BM_ConvFloatFwd(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
+BM_ConvFloatFwd(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
+BM_ConvFloatFwd(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
+BM_ConvFloatFwd(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
+BM_ConvFloatFwd(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
+BM_ConvFloatFwd(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
+BM_ConvFloatFwd(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
+BM_ConvFloatFwd(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
+BM_ConvFloatFwd(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
+BM_ConvFloatFwd(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
+BM_ConvFloatFwd(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
+BM_ConvFloatFwd(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
+BM_ConvFloatFwd(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
+BM_ConvFloatFwd(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
+BM_ConvFloatFwd(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
+BM_ConvFloatFwd(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
+BM_ConvFloatFwd(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
+BM_ConvFloatFwd(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
+BM_ConvFloatFwd(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
+BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
+BM_ConvFloatFwd(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
+BM_ConvFloatFwd(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
+BM_ConvFloatFwd(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
+BM_ConvFloatFwd(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
+BM_ConvFloatFwd(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
+BM_ConvFloatFwd(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
+BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
+BM_ConvFloatFwd(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
+BM_ConvFloatFwd(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
+BM_ConvFloatFwd(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
+BM_ConvFloatFwd(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
+BM_ConvFloatFwd(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
+BM_ConvFloatFwd(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);
+
+#define BM_ConvFloatBkInAndFilter(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \
+ static void BM_ConvFloatBkInCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatBkInCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 4, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatBkInGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \
+ STR, PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ static void BM_ConvFloatBkFilterCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatBkFilterCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 4, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatBkFilterGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ STR, PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ BENCHMARK(BM_ConvFloatBkInCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkInCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkInGPU_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterGPU_##LABEL)
+
+// Benchmarks from the inception model
+
+BM_ConvFloatBkInAndFilter(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
+BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
+BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
+BM_ConvFloatBkInAndFilter(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);
+
+#define BM_ConvFloatBkFCPU(BS, R, C, ID, OD, KR, KC, TH, LABEL) \
+ static void \
+ BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH( \
+ int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, TH, \
+ 1, VALID, false, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH)
+
+// Benchmarks from https://github.com/soumith/convnet-benchmarks
+BM_ConvFloatBkFCPU(128, 128, 128, 3, 96, 11, 11, 4, "convnet-layer1");
+BM_ConvFloatBkFCPU(128, 64, 64, 64, 128, 9, 9, 4, "convnet-layer2");
+BM_ConvFloatBkFCPU(128, 32, 32, 128, 128, 9, 9, 4, "convnet-layer3");
+BM_ConvFloatBkFCPU(128, 16, 16, 128, 128, 7, 7, 4, "convnet-layer4");
+BM_ConvFloatBkFCPU(128, 13, 13, 384, 384, 3, 3, 4, "convnet-layer5");
+
+#define BM_ConvFloatBkFGPU(BS, R, C, ID, OD, KR, KC, LABEL) \
+ static void BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \
+ int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ 1, VALID, true, LABEL); \
+ } \
+ BENCHMARK(BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC)
+
+// Benchmarks from https://github.com/soumith/convnet-benchmarks
+BM_ConvFloatBkFGPU(128, 128, 128, 3, 96, 11, 11, "convnet-layer1");
+BM_ConvFloatBkFGPU(128, 64, 64, 64, 128, 9, 9, "convnet-layer2");
+BM_ConvFloatBkFGPU(128, 32, 32, 128, 128, 9, 9, "convnet-layer3");
+BM_ConvFloatBkFGPU(128, 16, 16, 128, 128, 7, 7, "convnet-layer4");
+BM_ConvFloatBkFGPU(128, 13, 13, 384, 384, 3, 3, "convnet-layer5");
+
+static void BM_LRNFloat(int iters, int depth, int cols, int rows,
+ int batch_size, int range, int num_threads,
+ const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape({batch_size, rows, cols, depth});
+
+ Tensor input(DT_FLOAT, shape);
+ test::FillIota<float>(&input, 1.0);
+ inputs.push_back({nullptr, &input});
+
+  // LRN op.
+ NodeDef lrn_node_def;
+ TF_CHECK_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input("input", 0, DT_FLOAT)
+ .Attr("depth_radius", range)
+ .Attr("bias", 1.0)
+ .Attr("alpha", 0.1)
+ .Attr("beta", 0.5)
+ .Finalize(&lrn_node_def));
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), lrn_node_def, &status));
+ TF_CHECK_OK(status);
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> context(new OpKernelContext(params));
+
+ op->Compute(context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete context->release_output(0).tensor;
+ op->Compute(context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(context->mutable_output(0)->NumElements() * iters *
+ (2 * range + 1) * 2);
+ testing::SetLabel(label);
+}
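+
+// Note on the pattern above (it recurs in the pooling, relu and softmax
+// benchmarks below): the op is exercised directly through the kernel API --
+// build a NodeDef, instantiate the kernel with CreateOpKernel, wrap the
+// inputs in an OpKernelContext, and call Compute() inside the timed loop,
+// deleting the previous output each iteration so every Compute() allocates
+// a fresh output tensor.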
+
+#define BM_LRNFloatFwdCPU(DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL) \
+ static void \
+ BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS( \
+ int iters) { \
+ BM_LRNFloat(iters, DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS)
+
+// clang-format off
+// DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 8, "lrn 8 threads");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 8, "lrn 8 threads");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 8, "lrn 8 threads");
+// clang-format on
+
+/*
+AvgPooling Op
+*/
+static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth,
+ int kernel_rows, int kernel_cols, int stride,
+ Padding padding, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+ // AvgPooling op.
+ NodeDef avgpool_node_def;
+ CHECK_EQ(kernel_rows, kernel_cols);
+ Status status = NodeDefBuilder("avgpool_op", "AvgPool")
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&avgpool_node_def);
+ TF_CHECK_OK(status);
+
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), avgpool_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
+
+ op->Compute(avgpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete avgpool_context->release_output(0).tensor;
+ op->Compute(avgpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+#define BM_AvgPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_AvgPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "avgpool0_VALID");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "avgpool1_VALID");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "avgpool4_VALID");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "avgpool10_VALID");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "avgpool0_SAME");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "avgpool1_SAME");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "avgpool4_SAME");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "avgpool10_SAME");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "avgpool0_VALID");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "avgpool1_VALID");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "avgpool4_VALID");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "avgpool10_VALID");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "avgpool0_SAME");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "avgpool1_SAME");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "avgpool4_SAME");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME");
+
+static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols,
+ int depth, int kernel_rows, int kernel_cols,
+ int stride, Padding padding, int num_threads,
+ const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
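+  // Compute the spatial output size (and padding) that the forward AvgPool
+  // would produce for this configuration; AvgPoolGrad takes the original
+  // input shape plus a gradient tensor of that output shape.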
+ int out_height, out_width, pad_rows, pad_cols;
+ Status status =
+ Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, stride, stride,
+ padding, &out_height, &out_width, &pad_rows, &pad_cols);
+ TF_CHECK_OK(status);
+ TensorShape output_shape({batch_size, out_height, out_width, depth});
+ TensorShape shape2({4});
+ Tensor input_shape_tensor(DT_INT32, shape2);
+ int32 input_dims[] = {batch_size, rows, cols, depth};
+ for (int i = 0; i < 4; i++) {
+ input_shape_tensor.flat<int32>()(i) = input_dims[i];
+ }
+ inputs.push_back({nullptr, &input_shape_tensor});
+
+ Tensor output_backprop(DT_FLOAT, output_shape);
+ test::FillIota<float>(&output_backprop, 11.0);
+ inputs.push_back({nullptr, &output_backprop});
+
+ // AvgPoolGrad op.
+ NodeDef avgpool_grad_node_def;
+ status = NodeDefBuilder("avgpool_grad_op", "AvgPoolGrad")
+ .Input(FakeInput())
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&avgpool_grad_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, nullptr, cpu_allocator(), avgpool_grad_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
+
+ op->Compute(avgpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete avgpool_context->release_output(0).tensor;
+ op->Compute(avgpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+// The resulting symbol is too long. Need to use two macros to fit in 80 chars.
+#define BM_AvgPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_AvgPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Shapes taken from the 2015/05/16 inception model
+BM_AvgPoolBkCPU(32, 35, 35, 192, 3, 3, 1, SAME, 1, "avgpool_grad0_SAME");
+BM_AvgPoolBkCPU(32, 35, 35, 256, 3, 3, 1, SAME, 1, "avgpool_grad1_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 768, 3, 3, 1, SAME, 1, "avgpool_grad2_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1024, 3, 3, 1, SAME, 1, "avgpool_grad3_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1152, 3, 3, 1, SAME, 1, "avgpool_grad4_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1216, 3, 3, 1, SAME, 1, "avgpool_grad5_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1248, 5, 5, 3, VALID, 1, "avgpool_grad6_VALID");
+BM_AvgPoolBkCPU(32, 8, 8, 1760, 3, 3, 1, SAME, 1, "avgpool_grad7_SAME");
+BM_AvgPoolBkCPU(32, 8, 8, 2048, 8, 8, 1, VALID, 1, "avgpool_grad8_VALID");
+
+/*
+MaxPooling Op
+*/
+static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
+ int kernel_rows, int kernel_cols, int stride,
+ Padding padding, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+ // MaxPooling op.
+ NodeDef maxpool_node_def;
+ CHECK_EQ(kernel_rows, kernel_cols);
+ Status status = NodeDefBuilder("maxpool_op", "MaxPool")
+ .Input(FakeInput())
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&maxpool_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), maxpool_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> maxpool_context(new OpKernelContext(params));
+
+ op->Compute(maxpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete maxpool_context->release_output(0).tensor;
+ op->Compute(maxpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(maxpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+#define BM_MaxPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_MaxPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "maxpool10_VALID");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "maxpool10_VALID");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "maxpool0_SAME");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "maxpool1_SAME");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "maxpool4_SAME");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME");
+
+static void BM_MaxPoolBk(int iters, int batch_size, int rows, int cols,
+ int depth, int kernel_rows, int kernel_cols,
+ int stride, Padding padding, int num_threads,
+ bool use_gpu, const string& label) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+
+ int out_height, out_width, pad_rows, pad_cols;
+ Status status =
+ Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, stride, stride,
+ padding, &out_height, &out_width, &pad_rows, &pad_cols);
+ TF_CHECK_OK(status);
+
+ Tensor input_data(DT_FLOAT, TensorShape({batch_size, rows, cols, depth}));
+ input_data.flat<float>().setRandom();
+ Node* input_data_node = ops::Const(input_data, b.opts());
+
+ Tensor output_data(DT_FLOAT,
+ TensorShape({batch_size, out_height, out_width, depth}));
+ output_data.flat<float>().setRandom();
+ Node* output_data_node = ops::Const(output_data, b.opts());
+
+ Tensor output_diff(DT_FLOAT,
+ TensorShape({batch_size, out_height, out_width, depth}));
+ output_diff.flat<float>().setRandom();
+ Node* output_diff_node = ops::Const(output_diff, b.opts());
+
+ CHECK_EQ(kernel_rows, kernel_cols);
+ ops::MaxPoolGrad(input_data_node, output_data_node, output_diff_node,
+ {1, kernel_rows, kernel_cols, 1} /* ksize */,
+ {1, stride, stride, 1} /* stride */,
+ padding == VALID ? "VALID" : "SAME", b.opts());
+ Graph* g = new Graph(OpRegistry::Global());
+ TF_CHECK_OK(b.ToGraph(g));
+ string device = use_gpu ? "gpu" : "cpu";
+ test::Benchmark(device, g).Run(iters);
+
+ testing::ItemsProcessed(batch_size * rows * cols * depth * iters);
+ testing::SetLabel(label);
+}
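+
+// Unlike the kernel-level benchmarks above, BM_MaxPoolBk builds a small graph
+// (three constants feeding MaxPoolGrad) and runs it through test::Benchmark,
+// which is what lets the same code target either the "cpu" or the "gpu"
+// device.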
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+// The resulting symbol is too long. Need to use two macros to fit in 80 chars.
+// clang-format off
+#define BM_MaxPoolBkGPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH( \
+ int iters) { \
+ BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, true, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH) \
+
+#define BM_MaxPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH( \
+ int iters) { \
+ BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, false, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH)
+// clang-format on
+
+// Shapes taken from the 2015/05/16 inception model
+BM_MaxPoolBkGPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
+BM_MaxPoolBkGPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
+BM_MaxPoolBkGPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
+BM_MaxPoolBkGPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
+BM_MaxPoolBkGPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
+
+BM_MaxPoolBkCPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
+BM_MaxPoolBkCPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
+BM_MaxPoolBkCPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
+BM_MaxPoolBkCPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
+BM_MaxPoolBkCPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
+
+/*
+Relu Op
+*/
+static void BM_ReluFloat(int iters, int batch_size, int rows, int cols,
+ int depth, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+  // Relu op.
+ NodeDef relu_node_def;
+ Status status = NodeDefBuilder("relu_op", "Relu")
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(&relu_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), relu_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(params));
+
+ op->Compute(relu_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete relu_context->release_output(0).tensor;
+ op->Compute(relu_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(relu_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+#define BM_Relu(BS, IR, IC, ND, TH, LABEL) \
+ static void BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH(int iters) { \
+ BM_ReluFloat(iters, BS, IR, IC, ND, TH, LABEL); \
+ } \
+ BENCHMARK(BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH)
+
+BM_Relu(32, 112, 112, 64, 1, "relu0");
+BM_Relu(32, 56, 56, 192, 1, "relu1");
+BM_Relu(32, 28, 28, 352, 1, "relu4");
+BM_Relu(32, 14, 14, 576, 1, "relu10");
+BM_Relu(32, 112, 112, 64, 4, "relu0");
+BM_Relu(32, 56, 56, 192, 4, "relu1");
+BM_Relu(32, 28, 28, 352, 4, "relu4");
+BM_Relu(32, 14, 14, 576, 4, "relu10");
+
+static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth,
+ int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({node_depth, batch_size});
+  Tensor input1(DT_FLOAT, shape1);
+  test::FillIota<float>(&input1, 1.0);
+  inputs.push_back({nullptr, &input1});
+
+ // Softmax op.
+ NodeDef softmax_node_def;
+ TF_CHECK_OK(NodeDefBuilder("softmax_op", "Softmax")
+ .Input("input", 0, DT_FLOAT)
+ .Finalize(&softmax_node_def));
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), softmax_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> softmax_context(new OpKernelContext(params));
+
+ op->Compute(softmax_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete softmax_context->release_output(0).tensor;
+ op->Compute(softmax_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(softmax_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+#define BM_ImageNetSoftmaxFwdCPU(BATCH_SIZE, NODE_DEPTH, TH, LABEL) \
+ static void BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH( \
+ int iters) { \
+ BM_ImageNetSoftmaxFwd(iters, BATCH_SIZE, NODE_DEPTH, TH, LABEL); \
+ } \
+ BENCHMARK(BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_ImageNetSoftmaxFwdCPU(32, 1008, 1, "softmax32");
+BM_ImageNetSoftmaxFwdCPU(128, 1008, 1, "softmax128");
+BM_ImageNetSoftmaxFwdCPU(32, 1008, 4, "softmax32");
+BM_ImageNetSoftmaxFwdCPU(128, 1008, 4, "softmax128");
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/count_up_to_op.cc b/tensorflow/core/kernels/count_up_to_op.cc
new file mode 100644
index 0000000000..7cf4bdb6d0
--- /dev/null
+++ b/tensorflow/core/kernels/count_up_to_op.cc
@@ -0,0 +1,51 @@
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+template <class T>
+class CountUpToOp : public OpKernel {
+ public:
+ explicit CountUpToOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("limit", &limit_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ T before_increment;
+ {
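+      // Input 0 is a ref to the counter variable; hold its mutex so the
+      // read-check-increment below is atomic with respect to other ops that
+      // use the same variable.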
+ mutex_lock l(*context->input_ref_mutex(0));
+ Tensor tensor = context->mutable_input(0, true);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(tensor.shape()),
+ errors::InvalidArgument("input is not a scalar: ",
+ tensor.shape().DebugString()));
+ T* ptr = &tensor.scalar<T>()();
+ before_increment = *ptr;
+ if (*ptr >= limit_) {
+ context->SetStatus(errors::OutOfRange("Reached limit of ", limit_));
+ return;
+ }
+ ++*ptr;
+ }
+ // Output if no error.
+ Tensor* out_tensor;
+ OP_REQUIRES_OK(context, context->allocate_output("output", TensorShape({}),
+ &out_tensor));
+ out_tensor->scalar<T>()() = before_increment;
+ }
+
+ private:
+ T limit_;
+};
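+
+// Example of the resulting semantics (illustration, not from the original
+// change): with limit = 3 and the variable starting at 0, successive
+// CountUpTo calls output 0, 1 and 2 (the pre-increment values) and leave the
+// variable at 3; the next call produces no output and fails with OutOfRange.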
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("CountUpTo").TypeConstraint<TYPE>("T").Device(DEVICE_CPU), \
+ CountUpToOp<TYPE>)
+
+REGISTER(int32);
+REGISTER(int64);
+
+#undef REGISTER
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
new file mode 100644
index 0000000000..5d39b88166
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -0,0 +1,23 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
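+// REGISTERn(OP, DEVICE, NAME, FUNCTOR, T1, ..., Tn), from
+// cwise_ops_common.h, registers the kernel once per listed element type on
+// the given device -- so the line below covers float, double, int32 and
+// int64 on CPU.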
+REGISTER4(UnaryOp, CPU, "Abs", functor::abs, float, double, int32, int64);
+#ifndef __ANDROID__
+REGISTER_KERNEL_BUILDER(Name("ComplexAbs").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::abs<complex64>>);
+#endif
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Abs", functor::abs, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Abs")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .TypeConstraint<int32>("T"),
+ UnaryOp<CPUDevice, functor::abs<int32>>);
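+
+// Note that the int32 variant above is registered for DEVICE_GPU but uses
+// the CPUDevice functor with HostMemory inputs and outputs: the tensors stay
+// on the host and the computation runs there even though the op is placed on
+// the GPU. The same arrangement is repeated for Add, Div, Equal, etc. below.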
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add.cc b/tensorflow/core/kernels/cwise_op_add.cc
new file mode 100644
index 0000000000..a6cd4bddbe
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_add.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER7(BinaryOp, CPU, "Add", functor::add, float, double, int32, int64, int8,
+ int16, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Add", functor::add, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Add")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::add<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
new file mode 100644
index 0000000000..0a8f1313f8
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "Ceil", functor::ceil, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Ceil", functor::ceil, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_complex.cc b/tensorflow/core/kernels/cwise_op_complex.cc
new file mode 100644
index 0000000000..825181bc35
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_complex.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::make_complex<float>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::make_complex<float>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_conj.cc b/tensorflow/core/kernels/cwise_op_conj.cc
new file mode 100644
index 0000000000..ba445d1c3d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_conj.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::conj<complex64>>);
+#if GOOGLE_CUDA
+// REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_GPU),
+// UnaryOp<GPUDevice, functor::conj<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
new file mode 100644
index 0000000000..45e24fc2ec
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Cos", functor::cos, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Cos", functor::cos, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
new file mode 100644
index 0000000000..76d606ed03
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Div", functor::div, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Div", functor::div, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Div")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::div<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_equal_to.cc b/tensorflow/core/kernels/cwise_op_equal_to.cc
new file mode 100644
index 0000000000..8369299332
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_equal_to.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Equal", functor::equal_to, float, double, int32,
+ int64, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Equal", functor::equal_to, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Equal")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::equal_to<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
new file mode 100644
index 0000000000..b2603a1b4c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Exp", functor::exp, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Exp", functor::exp, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
new file mode 100644
index 0000000000..83c8203953
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "Floor", functor::floor, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Floor", functor::floor, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc
new file mode 100644
index 0000000000..59436afbc0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
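+// DEFINE_UNARYn / DEFINE_BINARYn (from cwise_ops_gpu_common.cu.h) expand to
+// the explicit template instantiations of the GPU functors for the listed
+// types, so the GPU registrations in the matching cwise_op_*.cc files have
+// definitions to link against; the logical_and/or/not files below spell the
+// same instantiations out by hand.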
+DEFINE_UNARY3(abs, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
new file mode 100644
index 0000000000..edf8e0d1a5
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(add, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc
new file mode 100644
index 0000000000..f24c4b8b73
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(ceil, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc
new file mode 100644
index 0000000000..29086b5c71
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY1(make_complex, float);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
new file mode 100644
index 0000000000..cae22cea8e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+// DEFINE_UNARY1(conj, complex64); // not working
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc
new file mode 100644
index 0000000000..c8412496a8
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(cos, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
new file mode 100644
index 0000000000..c581c0487e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(div, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
new file mode 100644
index 0000000000..f994822a74
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY4(equal_to, float, double, int64, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
new file mode 100644
index 0000000000..caeaa19cef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(exp, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc
new file mode 100644
index 0000000000..0a06ff2978
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(floor, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc
new file mode 100644
index 0000000000..e1278e077b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(greater, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc
new file mode 100644
index 0000000000..fafcf9b28a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(greater_equal, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc
new file mode 100644
index 0000000000..0370782c96
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY1(get_imag, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc
new file mode 100644
index 0000000000..020abef210
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(inverse, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc
new file mode 100644
index 0000000000..7a3a273af7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isfinite, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc
new file mode 100644
index 0000000000..cfc4be3d25
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isinf, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc
new file mode 100644
index 0000000000..c93b74387e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isnan, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc
new file mode 100644
index 0000000000..8e2b28ac60
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(less, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc
new file mode 100644
index 0000000000..be8e34a58b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(less_equal, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc
new file mode 100644
index 0000000000..7d183cce50
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(log, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc
new file mode 100644
index 0000000000..ba7046f9f0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc
@@ -0,0 +1,13 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct BinaryFunctor<GPUDevice, logical_and, 1>;
+template struct BinaryFunctor<GPUDevice, logical_and, 2>;
+template struct BinaryFunctor<GPUDevice, logical_and, 3>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc
new file mode 100644
index 0000000000..34a43a76ef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct UnaryFunctor<GPUDevice, logical_not>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc
new file mode 100644
index 0000000000..47a7bd68dc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc
@@ -0,0 +1,13 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct BinaryFunctor<GPUDevice, logical_or, 1>;
+template struct BinaryFunctor<GPUDevice, logical_or, 2>;
+template struct BinaryFunctor<GPUDevice, logical_or, 3>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc
new file mode 100644
index 0000000000..8f7ab90e9a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(maximum, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc
new file mode 100644
index 0000000000..75fd7f89b4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(minimum, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc
new file mode 100644
index 0000000000..d08a17a94d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+// No GPU ops for mod yet.
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
new file mode 100644
index 0000000000..e0a6738bef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(mul, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
new file mode 100644
index 0000000000..3031afbb75
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY4(neg, float, double, int32, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc
new file mode 100644
index 0000000000..59c76ee88b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY4(not_equal_to, float, double, int64, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc
new file mode 100644
index 0000000000..50177495bc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(pow, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc
new file mode 100644
index 0000000000..3b1d465914
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY1(get_real, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc
new file mode 100644
index 0000000000..682e2d2d4b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(rsqrt, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
new file mode 100644
index 0000000000..b5125648e3
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -0,0 +1,15 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct SelectFunctor<GPUDevice, float>;
+template struct SelectFunctor<GPUDevice, double>;
+template struct SelectFunctor<GPUDevice, int32>;
+template struct SelectFunctor<GPUDevice, int64>;
+template struct SelectFunctor<GPUDevice, complex64>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc
new file mode 100644
index 0000000000..9c250f3071
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sigmoid, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc
new file mode 100644
index 0000000000..f413480ecc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(sign, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc
new file mode 100644
index 0000000000..6135f3b780
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sin, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc
new file mode 100644
index 0000000000..9bdf3b9e30
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sqrt, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc
new file mode 100644
index 0000000000..6b900e994d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(square, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc
new file mode 100644
index 0000000000..6fd5ea0d38
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(sub, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc
new file mode 100644
index 0000000000..e0393f6c2a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(tanh, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
new file mode 100644
index 0000000000..9ae31dcdfe
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Greater", functor::greater, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Greater", functor::greater, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Greater")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
new file mode 100644
index 0000000000..be4cc5dc79
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -0,0 +1,22 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, float, double,
+ int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "GreaterEqual", functor::greater_equal, float, double,
+ int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater_equal<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_imag.cc b/tensorflow/core/kernels/cwise_op_imag.cc
new file mode 100644
index 0000000000..c2432326fc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_imag.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::get_imag<complex64>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::get_imag<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_inverse.cc b/tensorflow/core/kernels/cwise_op_inverse.cc
new file mode 100644
index 0000000000..6af883e755
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_inverse.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Inv", functor::inverse, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Inv", functor::inverse, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
new file mode 100644
index 0000000000..e52d199a8f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsFinite", functor::isfinite, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsFinite", functor::isfinite, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
new file mode 100644
index 0000000000..868204f86e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsInf", functor::isinf, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsInf", functor::isinf, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
new file mode 100644
index 0000000000..a8f4d60d0f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsNan", functor::isnan, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsNan", functor::isnan, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
new file mode 100644
index 0000000000..3b5f75445c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -0,0 +1,20 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Less", functor::less, float, double, int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Less", functor::less, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Less")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
new file mode 100644
index 0000000000..507c7c2908
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -0,0 +1,22 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "LessEqual", functor::less_equal, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "LessEqual", functor::less_equal, float, double,
+ int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("LessEqual")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less_equal<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
new file mode 100644
index 0000000000..ebc7cbcc4e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Log", functor::log, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Log", functor::log, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_and.cc b/tensorflow/core/kernels/cwise_op_logical_and.cc
new file mode 100644
index 0000000000..a4075088f4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_and.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::logical_and>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::logical_and>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_not.cc b/tensorflow/core/kernels/cwise_op_logical_not.cc
new file mode 100644
index 0000000000..b2e97bf70c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_not.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::logical_not>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::logical_not>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_or.cc b/tensorflow/core/kernels/cwise_op_logical_or.cc
new file mode 100644
index 0000000000..0d1df082f7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_or.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::logical_or>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::logical_or>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
new file mode 100644
index 0000000000..c0c9e3f6f5
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Maximum", functor::maximum, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Maximum", functor::maximum, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Maximum")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::maximum<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
new file mode 100644
index 0000000000..4c6bf7df05
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Minimum", functor::minimum, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Minimum", functor::minimum, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Minimum")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::minimum<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mod.cc b/tensorflow/core/kernels/cwise_op_mod.cc
new file mode 100644
index 0000000000..17f2834030
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_mod.cc
@@ -0,0 +1,6 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(BinaryOp, CPU, "Mod", functor::mod, int32, int64);
+REGISTER2(BinaryOp, CPU, "Mod", functor::fmod, float, double);
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul.cc b/tensorflow/core/kernels/cwise_op_mul.cc
new file mode 100644
index 0000000000..15f65012cd
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_mul.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER7(BinaryOp, CPU, "Mul", functor::mul, float, double, int32, int64, int8,
+ int16, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Mul", functor::mul, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Mul")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::mul<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
new file mode 100644
index 0000000000..3a19b2e94f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(UnaryOp, CPU, "Neg", functor::neg, float, double, int32, complex64,
+ int64);
+#if GOOGLE_CUDA
+REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, double, int32, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to.cc b/tensorflow/core/kernels/cwise_op_not_equal_to.cc
new file mode 100644
index 0000000000..02d434a1c2
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, double,
+ int32, int64, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, double,
+ int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
new file mode 100644
index 0000000000..d10dced85f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Pow", functor::pow, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_real.cc b/tensorflow/core/kernels/cwise_op_real.cc
new file mode 100644
index 0000000000..84295a5a16
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_real.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::get_real<complex64>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::get_real<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
new file mode 100644
index 0000000000..a22b1209de
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
new file mode 100644
index 0000000000..baa821690a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_SELECT(CPU, "Select", "", float);
+REGISTER_SELECT(CPU, "Select", "", double);
+REGISTER_SELECT(CPU, "Select", "", int32);
+REGISTER_SELECT(CPU, "Select", "", int64);
+REGISTER_SELECT(CPU, "Select", "", complex64);
+REGISTER_SELECT(CPU, "Select", "", string);
+#if GOOGLE_CUDA
+REGISTER_SELECT(GPU, "Select", "", float);
+REGISTER_SELECT(GPU, "Select", "", double);
+REGISTER_SELECT(GPU, "Select", "", int32);
+REGISTER_SELECT(GPU, "Select", "", int64);
+REGISTER_SELECT(GPU, "Select", "", complex64);
+#endif // GOOGLE_CUDA
+} // namespace tensorflow
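REGISTER_SELECT is defined in cwise_ops_common.h later in this patch; its third argument is currently unused by the macro. Each line above is roughly equivalent to a registration of the form:

REGISTER_KERNEL_BUILDER(
    Name("Select").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    SelectOp<CPUDevice, float>)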
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
new file mode 100644
index 0000000000..e03b5d54dd
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
new file mode 100644
index 0000000000..59a0bfa1ed
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -0,0 +1,19 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Sign", functor::sign, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sign")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .TypeConstraint<int32>("T"),
+ UnaryOp<CPUDevice, functor::sign<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
new file mode 100644
index 0000000000..e7c87374d7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sin", functor::sin, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sin", functor::sin, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
new file mode 100644
index 0000000000..f43241264a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sqrt", functor::sqrt, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sqrt", functor::sqrt, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
new file mode 100644
index 0000000000..510fda49aa
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(UnaryOp, CPU, "Square", functor::square, float, double, int32,
+ complex64, int64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Square", functor::square, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
new file mode 100644
index 0000000000..c3c5952f8d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Sub", functor::sub, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Sub", functor::sub, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sub")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::sub<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
new file mode 100644
index 0000000000..31f4743449
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Tanh", functor::tanh, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Tanh", functor::tanh, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
new file mode 100644
index 0000000000..7d818cfbbf
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -0,0 +1,607 @@
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_H_
+
+#include <cmath>
+#include <functional>
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+// The following functors (sign, tanh, sigmoid, etc.) are not defined
+// by Eigen. When their equivalents are added to Eigen, we can
+// replace them with type aliases.
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct scalar_sign_op {
+ // TODO(zhifengc): this only works for real types. In theory,
+ // sign(x) = x / |x| works for both real and complex values.
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+ return T(x > T(0)) - T(x < T(0));
+ }
+};
+
+// TODO(zhifengc): Eigen::internal::pow_impl does not have proper
+// EIGEN host/device decoration. We duplicate code here for now.
+template <typename T, bool IsInteger>
+struct pow {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T
+ operator()(const T& x, const T& y) const {
+ return std::pow(x, y);
+ }
+};
+
+template <typename T>
+struct pow<T, true> {
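+ // Integer exponentiation by squaring: e.g., operator()(3, 5) returns
+ // 3^5 = 243 using O(log y) multiplications.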
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x, T y) const {
+ T res(1);
+ if (y & 1) res *= x;
+ y >>= 1;
+ while (y) {
+ x *= x;
+ if (y & 1) res *= x;
+ y >>= 1;
+ }
+ return res;
+ }
+};
+
+template <typename T>
+struct scalar_pow2_op : pow<T, NumTraits<T>::IsInteger> {};
+
+template <typename T>
+struct functor_traits<scalar_pow2_op<T> > {
+ enum {
+ Cost = 5 * NumTraits<T>::MulCost,
+ PacketAccess = false,
+ };
+};
+
+template <typename T>
+struct scalar_fmod2_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod2_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
+ const T& b) const {
+ return fmod(a, b);
+ }
+};
+
+template <typename T>
+struct scalar_mod2_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T
+ operator()(const T& a, const T& b) const {
+ return a % b;
+ }
+};
+
+template <typename T>
+struct functor_traits<scalar_mod2_op<T> > {
+ enum {
+ Cost = 5, // Roughly the cost of a div
+ PacketAccess = false,
+ };
+};
+
+// scalar_left and scalar_right are template helpers to partially
+// apply a binary function.
+//
+// Suppose Binary is a binary functor f(x, y), scalar_left<> is a
+// unary functor g_x(y) = f(x, y), where x is provided via the
+// constructor. Similarly, scalar_right<> is a unary functor g_y(x) =
+// f(x, y).
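+//
+// For example, scalar_left<float, float, scalar_sum_op<float> > constructed
+// with a pointer to the value 2.0f behaves as the unary functor y -> 2.0f + y.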
+
+template <typename Tout, typename Tin, typename Binary,
+ bool PacketAccess = functor_traits<Binary>::PacketAccess>
+struct scalar_left {
+ typedef Tout result_type;
+ const Tin* left;
+ EIGEN_DEVICE_FUNC inline scalar_left(
+ const scalar_left& other) // NOLINT(runtime/explicit)
+ : left(other.left) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const {
+ return Binary()(*left, right);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct scalar_left<Tout, Tin, Binary, true> {
+ typedef Tout result_type;
+ const Tin* left;
+ EIGEN_DEVICE_FUNC inline scalar_left(
+ const scalar_left& other) // NOLINT(runtime/explicit)
+ : left(other.left) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const {
+ return Binary()(*left, right);
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& right_packet) const {
+ const Packet left_packet = Eigen::internal::pset1<Packet>(*left);
+ return Binary().packetOp(left_packet, right_packet);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct functor_traits<scalar_left<Tout, Tin, Binary> > {
+ enum {
+ Cost = functor_traits<Binary>::Cost,
+ PacketAccess = functor_traits<Binary>::PacketAccess,
+ };
+};
+
+template <typename Tout, typename Tin, typename Binary,
+ bool PacketAccess = functor_traits<Binary>::PacketAccess>
+struct scalar_right {
+ typedef Tout result_type;
+ const Tin* right;
+ EIGEN_DEVICE_FUNC inline scalar_right(
+ const scalar_right& other) // NOLINT(runtime/explicit)
+ : right(other.right) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const {
+ return Binary()(left, *right);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct scalar_right<Tout, Tin, Binary, true> {
+ typedef Tout result_type;
+ const Tin* right;
+ EIGEN_DEVICE_FUNC inline scalar_right(
+ const scalar_right& other) // NOLINT(runtime/explicit)
+ : right(other.right) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const {
+ return Binary()(left, *right);
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& left_packet) const {
+ const Packet right_packet = Eigen::internal::pset1<Packet>(*right);
+ return Binary().packetOp(left_packet, right_packet);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct functor_traits<scalar_right<Tout, Tin, Binary> > {
+ enum {
+ Cost = functor_traits<Binary>::Cost,
+ PacketAccess = functor_traits<Binary>::PacketAccess,
+ };
+};
+
+// similar to std::equal_to, but with the DEVICE_FUNC qualifier
+template <class T>
+struct equal_to : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x == y; }
+};
+
+// similar to std::not_equal_to, but with the DEVICE_FUNC qualifier
+template <class T>
+struct not_equal_to : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x != y; }
+};
+
+// similar to std::greater, but with the DEVICE_FUNC qualifier
+template <class T>
+struct greater : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x > y; }
+};
+
+// similar to std::less, but with the DEVICE_FUNC qualifier
+template <class T>
+struct less : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x < y; }
+};
+
+// similar to std::greater_equal, but with the DEVICE_FUNC qualifier
+template <class T>
+struct greater_equal : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x >= y; }
+};
+
+// similar to std::less_equal, but with the DEVICE_FUNC qualifier
+template <class T>
+struct less_equal : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x <= y; }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+namespace tensorflow {
+namespace functor {
+
+////////////////////////////////////////////////////////////////////////////////
+// Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+// Base template for functors whose input scalar type is T and
+// output scalar type is R.
+template <typename T, typename F, typename R = T>
+struct base {
+ // func defines operator() and its vectorized version packetOp().
+ typedef F func;
+
+ // If true, the functor's corresponding binary op will instantiate
+ // specialized kernels to perform an optimized broadcast
+ // operation. Each functor for which this is enabled increases the
+ // code size, so by default this is disabled for binary functors and
+ // is enabled on a per-op basis as needed.
+ static const bool use_bcast_optimization = false;
+
+ // operator() has the signature:
+ // out_type operator()(in_type in0, in_type in1 ...)
+ typedef R out_type;
+ typedef T in_type;
+
+ // TensorFlow provides tensor-ized versions of "func". Roughly
+ // speaking, the TensorFlow operation has the signature:
+ // tout_type op(tin_type in0)
+ // tout_type op(tin_type in0, tin_type in1)
+ // tout_type op(tin_type in0, in_type scalar)
+ typedef typename TTypes<out_type>::Flat tout_type;
+ typedef typename TTypes<in_type>::ConstFlat tin_type;
+ typedef typename TTypes<in_type>::ConstScalar tscalar_type;
+};
+
+// For now, we only apply certain speed optimizations to
+// float/double broadcast binary ops.
+template <typename T>
+struct use_bcast_optimization {
+ static const bool value = false;
+};
+
+template <>
+struct use_bcast_optimization<float> {
+ static const bool value = true;
+};
+
+template <>
+struct use_bcast_optimization<double> {
+ static const bool value = true;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Unary functors
+////////////////////////////////////////////////////////////////////////////////
+
+// abs(x) = |x|
+// neg(x) = - x
+// inverse(x) = 1 / x
+// square(x) = x^2
+// sqrt(x) = x^(1/2)
+// rsqrt(x) = x^(-1/2)
+// exp(x) = e^x
+// log(x) = natural logarithm of x
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+// sigmoid(x) = 1 / (1 + exp(-x)) // a.k.a. logistic
+//
+// NOTE: We may eventually implement common functions used in NN
+// here. E.g., rectifier, softplus, derivatives of tanh, sigmoid, etc.
+// For reference, see speech/lstm/eigen_functors.h.
+
+template <typename T>
+struct abs : base<T, Eigen::internal::scalar_abs_op<T>,
+ typename Eigen::internal::scalar_abs_op<T>::result_type> {};
+
+template <typename T>
+struct neg : base<T, Eigen::internal::scalar_opposite_op<T> > {};
+
+template <typename T>
+struct inverse : base<T, Eigen::internal::scalar_inverse_op<T> > {};
+
+template <typename T>
+struct square : base<T, Eigen::internal::scalar_square_op<T> > {};
+
+template <typename T>
+struct sqrt : base<T, Eigen::internal::scalar_sqrt_op<T> > {};
+
+template <typename T>
+struct rsqrt : base<T, Eigen::internal::scalar_rsqrt_op<T> > {};
+
+template <typename T>
+struct exp : base<T, Eigen::internal::scalar_exp_op<T> > {};
+
+template <typename T>
+struct log : base<T, Eigen::internal::scalar_log_op<T> > {};
+
+template <typename T>
+struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {};
+
+template <typename T>
+struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {};
+
+template <typename T>
+struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T> > {};
+
+template <typename T>
+struct sin : base<T, Eigen::internal::scalar_sin_op<T> > {};
+
+template <typename T>
+struct cos : base<T, Eigen::internal::scalar_cos_op<T> > {};
+
+struct logical_not : base<bool, std::logical_not<bool> > {};
+
+namespace impl {
+
+#ifndef __CUDACC__
+// Uses STL std cmath functions.
+template <typename T>
+bool isinf(T v) {
+ return std::isinf(v);
+}
+
+template <typename T>
+bool isnan(T v) {
+ return std::isnan(v);
+}
+
+template <typename T>
+bool isfinite(T v) {
+ return std::isfinite(v);
+}
+
+template <typename T>
+T floor(T v) {
+ return std::floor(v);
+}
+
+template <typename T>
+T ceil(T v) {
+ return std::ceil(v);
+}
+#else
+// Uses CUDA's functions for float and double.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isinf(T v) {
+ return ::isinf(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isnan(T v) {
+ return ::isnan(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isfinite(T v) {
+ return ::isfinite(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T floor(T v) {
+ return ::floor(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T ceil(T v) {
+ return ::ceil(v);
+}
+#endif
+} // end namespace impl
+
+// NOTE: std::isinf, std::isnan, std::isfinite are plain functions, so
+// we wrap them in functors to be used with Eigen's type system.
+
+template <typename T>
+struct isinf_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isinf(x);
+ }
+};
+
+template <typename T>
+struct isinf : base<T, isinf_func<T>, bool> {};
+
+template <typename T>
+struct isnan_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isnan(x);
+ }
+};
+
+template <typename T>
+struct isnan : base<T, isnan_func<T>, bool> {};
+
+template <typename T>
+struct isfinite_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isfinite(x);
+ }
+};
+
+template <typename T>
+struct isfinite : base<T, isfinite_func<T>, bool> {};
+
+template <typename T>
+struct floor_func {
+ typedef T result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
+ return impl::floor(x);
+ }
+};
+
+template <typename T>
+struct floor : base<T, floor_func<T> > {};
+
+template <typename T>
+struct ceil_func {
+ typedef T result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
+ return impl::ceil(x);
+ }
+};
+
+template <typename T>
+struct ceil : base<T, ceil_func<T> > {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Binary functors
+////////////////////////////////////////////////////////////////////////////////
+
+// Binary functors:
+//
+// add(x, y) = x + y
+// sub(x, y) = x - y
+// mul(x, y) = x * y
+// div(x, y) = x / y
+// mod(x, y) = x % y (int32 and int64 only)
+// fmod(x, y) = fmod(x, y) (float and double only)
+// pow(x, y) = x ^ y
+// maximum(x, y) = x > y ? x : y
+// minimum(x, y) = x < y ? x : y
+
+template <typename T>
+struct add : base<T, Eigen::internal::scalar_sum_op<T> > {
+ static const bool use_bcast_optimization = true;
+};
+
+template <typename T>
+struct sub : base<T, Eigen::internal::scalar_difference_op<T> > {
+ static const bool use_bcast_optimization = true;
+};
+
+template <typename T>
+struct mul : base<T, Eigen::internal::scalar_product_op<T> > {};
+
+template <typename T>
+struct div : base<T, Eigen::internal::scalar_quotient_op<T> > {};
+
+template <typename T>
+struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T> > {};
+
+template <typename T>
+struct mod : base<T, Eigen::internal::scalar_mod2_op<T> > {};
+
+template <typename T>
+struct pow : base<T, Eigen::internal::scalar_pow2_op<T> > {};
+
+template <typename T>
+struct maximum : base<T, Eigen::internal::scalar_max_op<T> > {};
+
+template <typename T>
+struct minimum : base<T, Eigen::internal::scalar_min_op<T> > {};
+
+template <typename T>
+struct less : base<T, Eigen::internal::less<T>, bool> {};
+
+template <typename T>
+struct less_equal : base<T, Eigen::internal::less_equal<T>, bool> {};
+
+template <typename T>
+struct greater : base<T, Eigen::internal::greater<T>, bool> {};
+
+template <typename T>
+struct greater_equal : base<T, Eigen::internal::greater_equal<T>, bool> {};
+
+template <typename T>
+struct equal_to : base<T, Eigen::internal::equal_to<T>, bool> {};
+
+template <typename T>
+struct not_equal_to : base<T, Eigen::internal::not_equal_to<T>, bool> {};
+
+struct logical_and : base<bool, Eigen::internal::scalar_boolean_and_op> {};
+
+struct logical_or : base<bool, Eigen::internal::scalar_boolean_or_op> {};
+
+template <typename T>
+struct make_complex_func {
+ typedef std::complex<T> result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ result_type operator()(T real, T imag) const {
+ return std::complex<T>(real, imag);
+ }
+};
+
+template <typename T>
+struct make_complex : base<T, make_complex_func<T>, std::complex<T> > {};
+
+template <typename T>
+struct get_real
+ : base<T, Eigen::internal::scalar_real_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct get_imag
+ : base<T, Eigen::internal::scalar_imag_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct conj : base<T, Eigen::internal::scalar_conjugate_op<T> > {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Functors that take 1 or 2 tensors, compute the base functor on each
+// coefficient of the input tensors, and put the results in the output
+// tensor.
+////////////////////////////////////////////////////////////////////////////////
+template <typename Device, typename Functor>
+struct UnaryFunctor {
+ // Computes on device "d": out[i] = Functor(in[i])
+ void operator()(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in);
+};
+
+template <typename Device, typename Functor, int NDIMS>
+struct BinaryFunctor {
+ // Computes on device "d": out[i] = Functor(in0[i], in1[i])
+ void operator()(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1);
+
+ // Computes on device "d": out[i] = Functor(scalar[0], in[i])
+ void Left(const Device& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in);
+
+ // Computes on device "d": out[i] = Functor(in[i], scalar[0])
+ void Right(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar);
+
+ // Computes on device "d":
+ // out = Functor(in0.broadcast(bcast0), in1.broadcast(bcast1))
+ //
+ // TODO(zhifengc): make BCast a template member function on NDIMS
+ // instead of making BinaryFunctor a template on NDIMS.
+ void BCast(const Device& d,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1);
+};
+
+template <int NDIMS>
+bool AllOne(const typename Eigen::array<Eigen::DenseIndex, NDIMS>& a) {
+ for (int i = 0; i < a.size(); ++i) {
+ if (a[i] != 1) return false;
+ }
+ return true;
+}
+
+template <typename Device, typename T>
+struct SelectFunctor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_H_
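A new coefficient-wise unary op would follow the floor/ceil pattern above: a small scalar functor plus a base<> wrapper. A minimal sketch, assuming a hypothetical "Round" op (the round name, round_func helper, and its half-up rounding are illustrative only and not part of this change):

namespace tensorflow {
namespace functor {

template <typename T>
struct round_func {
  typedef T result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
    // Naive round-half-up, for illustration only.
    return impl::floor(x + T(0.5));
  }
};

template <typename T>
struct round : base<T, round_func<T> > {};

}  // namespace functor
}  // namespace tensorflow

The functor alone does nothing until a kernel is registered for it; a matching registration file is sketched after cwise_ops_common.h below.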
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
new file mode 100644
index 0000000000..f86d2ddd9a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -0,0 +1,42 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+
+BinaryOpShared::BinaryOpShared(OpKernelConstruction* ctx, DataType out,
+ DataType in)
+ : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({in, in}, {out}));
+}
+
+void BinaryOpShared::SetUnimplementedError(OpKernelContext* ctx) {
+ ctx->SetStatus(errors::Unimplemented(
+ "Broadcast between ", ctx->input(0).shape().ShortDebugString(), " and ",
+ ctx->input(1).shape().ShortDebugString(), " is not supported yet."));
+}
+
+static BCast::Vec FromShape(const TensorShape& shape) {
+ BCast::Vec ret;
+ for (int i = 0; i < shape.dims(); ++i) ret.push_back(shape.dim_size(i));
+ return ret;
+}
+
+static TensorShape ToShape(const BCast::Vec& vec) {
+ TensorShape shape;
+ for (auto elem : vec) shape.AddDim(elem);
+ return shape;
+}
+
+BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
+ : bcast(FromShape(ctx->input(0).shape()),
+ FromShape(ctx->input(1).shape())) {
+ if (!bcast.IsValid()) {
+ ctx->SetStatus(errors::InvalidArgument(
+ "Incompatible shapes: ", ctx->input(0).shape().ShortDebugString(),
+ " vs. ", ctx->input(1).shape().ShortDebugString()));
+ return;
+ }
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output(0, ToShape(bcast.output_shape()), &out));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
new file mode 100644
index 0000000000..cf848b86d1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -0,0 +1,390 @@
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/cwise_ops.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+class BinaryOpShared : public OpKernel {
+ public:
+ explicit BinaryOpShared(OpKernelConstruction* ctx, DataType out, DataType in);
+
+ protected:
+ struct BinaryOpState {
+ // Sets up bcast with the shape of in0 and in1, ensures that the bcast
+ // is valid, and if so, allocates out using ctx->allocate_output(...).
+ // Caller must check ctx->status() upon return for non-ok status.
+ // If ctx->status().ok() is true, then out is guaranteed to be allocated.
+ BinaryOpState(OpKernelContext* ctx);
+
+ BCast bcast;
+ Tensor* out = nullptr;
+ };
+
+ template <int NDIMS>
+ static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+ const BCast::Vec& vec) {
+ CHECK_EQ(vec.size(), NDIMS);
+ Eigen::array<Eigen::DenseIndex, NDIMS> ret;
+ for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i];
+ return ret;
+ }
+ void SetUnimplementedError(OpKernelContext* ctx);
+};
+
+// Coefficient-wise binary operations:
+// Device: E.g., CPUDevice, GPUDevice.
+// Functor: defined in cwise_ops.h. E.g., functor::add.
+template <typename Device, typename Functor>
+class BinaryOp : public BinaryOpShared {
+ public:
+ typedef typename Functor::in_type Tin; // Input scalar data type.
+ typedef typename Functor::out_type Tout; // Output scalar data type.
+
+ explicit BinaryOp(OpKernelConstruction* ctx)
+ : BinaryOpShared(ctx, DataTypeToEnum<Tout>::v(),
+ DataTypeToEnum<Tin>::v()) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+ // 'state': Shared helper not dependent on T to reduce code size
+ BinaryOpState state(ctx);
+ if (!ctx->status().ok()) return;
+ Tensor* out = state.out;
+ BCast* bcast = &state.bcast;
+ if (out->NumElements() == 0) {
+ return;
+ }
+ const int ndims = bcast->x_reshape().size();
+ if (ndims <= 1) {
+ if (in1.NumElements() == 1) {
+ // tensor op scalar
+ functor::BinaryFunctor<Device, Functor, 1>().Right(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(),
+ in1.scalar<Tin>());
+ return;
+ }
+ if (in0.NumElements() == 1) {
+ // scalar op tensor
+ functor::BinaryFunctor<Device, Functor, 1>().Left(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.scalar<Tin>(),
+ in1.flat<Tin>());
+ return;
+ }
+ functor::BinaryFunctor<Device, Functor, 1>()(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(),
+ in1.flat<Tin>());
+ return;
+ }
+
+ if (ndims == 2) {
+ functor::BinaryFunctor<Device, Functor, 2>().BCast(
+ ctx->eigen_device<Device>(),
+ out->shaped<Tout, 2>(bcast->result_shape()),
+ in0.shaped<Tin, 2>(bcast->x_reshape()),
+ ToIndexArray<2>(bcast->x_bcast()),
+ in1.shaped<Tin, 2>(bcast->y_reshape()),
+ ToIndexArray<2>(bcast->y_bcast()));
+ return;
+ }
+
+ if (ndims == 3) {
+ functor::BinaryFunctor<Device, Functor, 3>().BCast(
+ ctx->eigen_device<Device>(),
+ out->shaped<Tout, 3>(bcast->result_shape()),
+ in0.shaped<Tin, 3>(bcast->x_reshape()),
+ ToIndexArray<3>(bcast->x_bcast()),
+ in1.shaped<Tin, 3>(bcast->y_reshape()),
+ ToIndexArray<3>(bcast->y_bcast()));
+ return;
+ }
+
+ SetUnimplementedError(ctx);
+ }
+
+ private:
+};
+
+// Coefficient-wise unary operations:
+// Device: E.g., CPUDevice, GPUDevice.
+// Functor: defined in cwise_ops.h. E.g., functor::sqrt.
+template <typename Device, typename Functor>
+class UnaryOp : public OpKernel {
+ public:
+ typedef typename Functor::in_type Tin; // Input scalar data type.
+ typedef typename Functor::out_type Tout; // Output scalar data type.
+ // Tin may be different from Tout. E.g., abs: complex64 -> float
+
+ explicit UnaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ auto in = DataTypeToEnum<Tin>::v();
+ auto out = DataTypeToEnum<Tout>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({in}, {out}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+ functor::UnaryFunctor<Device, Functor>()(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), inp.flat<Tin>());
+ }
+};
+
+// Coefficient-wise select operation.
+// Device: E.g., CPUDevice, GPUDevice.
+template <typename Device, typename T>
+class SelectOp : public OpKernel {
+ public:
+ explicit SelectOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ auto dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({DT_BOOL, dt, dt}, {dt}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+ const Tensor& in2 = ctx->input(2);
+ if (!ctx->ValidateInputsAreSameShape(this)) return;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out));
+ functor::SelectFunctor<Device, T> func;
+ func(ctx->eigen_device<Device>(), out->flat<T>(), in0.flat<bool>(),
+ in1.flat<T>(), in2.flat<T>());
+ }
+};
+
+namespace functor {
+
+// For CPUDevice, we do operations inline if the resulting tensor is
+// modestly sized.
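+// Assigning inline avoids dispatching a small amount of work to the Eigen
+// thread pool, where the scheduling overhead would dominate the work itself.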
+static bool DoInline(size_t size) { return size <= 32768; }
+
+template <typename D, typename OUT, typename RHS>
+void Assign(const D& d, OUT out, RHS rhs) {
+ if (DoInline(out.size())) {
+ out = rhs;
+ } else {
+ out.device(d) = rhs;
+ }
+}
+
+// Partial specialization of BinaryFunctor<Device=CPUDevice, Functor>.
+template <typename Functor, int NDIMS>
+struct BinaryFunctor<CPUDevice, Functor, NDIMS> {
+ void operator()(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1) {
+ Assign(d, out, in0.binaryExpr(in1, typename Functor::func()));
+ }
+
+ void Left(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+ Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+ }
+
+ void Right(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+ Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+ }
+
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ inline Eigen::DSizes<int, 2> NByOne(int n) {
+ return Eigen::DSizes<int, 2>(n, 1);
+ }
+ inline Eigen::DSizes<int, 2> OneByM(int m) {
+ return Eigen::DSizes<int, 2>(1, m);
+ }
+#else
+ inline Eigen::IndexList<int, Eigen::type2index<1>> NByOne(int n) {
+ Eigen::IndexList<int, Eigen::type2index<1>> ret;
+ ret.set(0, n);
+ return ret;
+ }
+ inline Eigen::IndexList<Eigen::type2index<1>, int> OneByM(int m) {
+ Eigen::IndexList<Eigen::type2index<1>, int> ret;
+ ret.set(1, m);
+ return ret;
+ }
+#endif
+
+ void BCast(const CPUDevice& dev,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) {
+ typedef typename Functor::in_type T;
+ typename Functor::func func;
+ if ((NDIMS == 2) && Functor::use_bcast_optimization &&
+ use_bcast_optimization<T>::value) {
+ // Optimize for speed by using Eigen::type2index and avoiding
+ // .broadcast() when we know it's a no-op.
+ //
+ // Here, we need to handle 6 cases depending on how many "1"s
+ // appear in in0's and in1's shapes (4 numbers in total). The two
+ // shapes cannot have more than two 1s between them, because such
+ // cases are simplified to the NDIMS==1 case.
+ //
+ // Because this optimization increases the binary size for each
+ // Functor (+, -, *, /, <, <=, etc.), type, and ndim combination,
+ // we only apply it for selected ops/types/ndims.
+ //
+ // Because NDIMS, Functor::use_bcast_optimization and
+ // use_bcast_optimization<T> are compile-time constants, gcc
+ // does a decent job of avoiding generating code when the
+ // conditions are not met.
+ const int a = in0.dimension(0); // in0 is shape [a, b]
+ const int b = in0.dimension(1);
+ const int c = in1.dimension(0); // in1 is shape [c, d]
+ const int d = in1.dimension(1);
+ if ((a == 1) && (d == 1)) {
+ auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+ auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if ((b == 1) && (c == 1)) {
+ auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+ auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (a == 1) {
+ auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+ auto rhs = in1;
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (b == 1) {
+ auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+ auto rhs = in1;
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (c == 1) {
+ auto lhs = in0;
+ auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (d == 1) {
+ auto lhs = in0;
+ auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+
+ const bool bcast0_all_one = AllOne<NDIMS>(bcast0);
+ const bool bcast1_all_one = AllOne<NDIMS>(bcast1);
+ if (bcast0_all_one && !bcast1_all_one) {
+ auto lhs = in0; // No need to do broadcast for in0
+ auto rhs = in1.broadcast(bcast1);
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+
+ if (!bcast0_all_one && bcast1_all_one) {
+ auto lhs = in0.broadcast(bcast0);
+ auto rhs = in1; // No need to do broadcast for in1
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ }
+
+ // Fallback path. Always works, but is probably slower.
+ auto lhs = in0.broadcast(bcast0);
+ auto rhs = in1.broadcast(bcast1);
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ }
+};
+
+// Partial specialization of UnaryFunctor<Device=CPUDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<CPUDevice, Functor> {
+ void operator()(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in) {
+ Assign(d, out, in.unaryExpr(typename Functor::func()));
+ }
+};
+
+template <typename T>
+struct SelectFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat) {
+ Assign(d, out, cond_flat.select(then_flat, else_flat));
+ }
+};
+
+} // end namespace functor
+
+#define REGISTER_SELECT(D, N, F, T) \
+ REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ SelectOp<D##Device, T>)
+
+#define REGISTER(OP, D, N, F, T) \
+ REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ OP<D##Device, F<T>>);
+
+// Macros to register kernels for multiple types (T0, T1, etc.) on
+// device type "D" (CPU or GPU) for operation "N" (e.g., sqrt) using
+// the functor "F" (e.g., functor::sqrt).
+
+#ifdef __ANDROID__
+// On Android, only register the first type (float)
+#define REGISTER2(OP, D, N, F, T0, T1) REGISTER(OP, D, N, F, T0)
+#define REGISTER3(OP, D, N, F, T0, T1, T2) REGISTER(OP, D, N, F, T0)
+#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) REGISTER(OP, D, N, F, T0)
+#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) REGISTER(OP, D, N, F, T0)
+#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) REGISTER(OP, D, N, F, T0)
+#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \
+ REGISTER(OP, D, N, F, T0)
+#else // !__ANDROID__
+#define REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER(OP, D, N, F, T0) \
+ REGISTER(OP, D, N, F, T1)
+#define REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER(OP, D, N, F, T2)
+#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) \
+ REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER2(OP, D, N, F, T2, T3)
+#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) \
+ REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER2(OP, D, N, F, T3, T4)
+#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) \
+ REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER3(OP, D, N, F, T3, T4, T5)
+#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \
+ REGISTER4(OP, D, N, F, T0, T1, T2, T3) \
+ REGISTER3(OP, D, N, F, T4, T5, T6)
+#endif // __ANDROID__
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
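Putting the pieces together, registering a new coefficient-wise op against these macros takes a one-file translation unit in this directory. Continuing the hypothetical "Round" sketch from cwise_ops.h above (illustrative only; a real op would also need an op definition in ../ops/math_ops.cc and, for GPU, an explicit functor instantiation):

// cwise_op_round.cc (sketch)
#include "tensorflow/core/kernels/cwise_ops_common.h"

namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Round", functor::round, float, double);
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Round", functor::round, float, double);
#endif
}  // namespace tensorflow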
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
new file mode 100644
index 0000000000..b0dc027144
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -0,0 +1,135 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+
+#define EIGEN_USE_GPU
+
+#include <complex>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/kernels/cwise_ops.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+#include "tensorflow/core/platform/logging.h"
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef std::complex<float> complex64;
+
+// Partial specialization of UnaryFunctor<Device=GPUDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<GPUDevice, Functor> {
+ void operator()(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in) {
+ out.device(d) = in.unaryExpr(typename Functor::func());
+ }
+};
+
+// Partial specialization of BinaryFunctor<Device=GPUDevice, Functor>.
+template <typename Functor, int NDIMS>
+struct BinaryFunctor<GPUDevice, Functor, NDIMS> {
+ void operator()(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1) {
+ out.device(d) = in0.binaryExpr(in1, typename Functor::func());
+ }
+
+ void Left(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+ out.device(d) = in.unaryExpr(Unary(scalar.data()));
+ }
+
+ void Right(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+ out.device(d) = in.unaryExpr(Unary(scalar.data()));
+ }
+
+ void BCast(const GPUDevice& d,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) {
+ typedef typename Functor::in_type T;
+ typename Functor::func func;
+ if ((NDIMS == 2) && Functor::use_bcast_optimization &&
+ use_bcast_optimization<T>::value) {
+ const bool bcast0_all_one = AllOne<NDIMS>(bcast0);
+ const bool bcast1_all_one = AllOne<NDIMS>(bcast1);
+ if (bcast0_all_one && !bcast1_all_one) {
+ out.device(d) = in0.binaryExpr(in1.broadcast(bcast1), func);
+ return;
+ }
+ if (!bcast0_all_one && bcast1_all_one) {
+ out.device(d) = in0.broadcast(bcast0).binaryExpr(in1, func);
+ return;
+ }
+ }
+ out.device(d) =
+ in0.broadcast(bcast0).binaryExpr(in1.broadcast(bcast1), func);
+ }
+};
+
+template <typename T>
+struct SelectFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat) {
+ out.device(d) = cond_flat.select(then_flat, else_flat);
+ }
+};
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt).
+#define DEFINE_UNARY1(F, T) template struct UnaryFunctor<GPUDevice, F<T> >
+#define DEFINE_UNARY2(F, T0, T1) \
+ DEFINE_UNARY1(F, T0); \
+ DEFINE_UNARY1(F, T1)
+#define DEFINE_UNARY3(F, T0, T1, T2) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY1(F, T2)
+#define DEFINE_UNARY4(F, T0, T1, T2, T3) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY2(F, T2, T3)
+#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY3(F, T2, T3, T4)
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for BinaryFunctor.
+#define DEFINE_BINARY1(F, T) \
+ template struct BinaryFunctor<GPUDevice, F<T>, 1>; \
+ template struct BinaryFunctor<GPUDevice, F<T>, 2>; \
+ template struct BinaryFunctor<GPUDevice, F<T>, 3>
+#define DEFINE_BINARY2(F, T0, T1) \
+ DEFINE_BINARY1(F, T0); \
+ DEFINE_BINARY1(F, T1)
+#define DEFINE_BINARY3(F, T0, T1, T2) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY1(F, T2)
+#define DEFINE_BINARY4(F, T0, T1, T2, T3) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY2(F, T2, T3)
+#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY3(F, T2, T3, T4)
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
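The DEFINE_UNARYn / DEFINE_BINARYn macros are intended for per-op *.cu.cc translation units, which explicitly instantiate the GPU functors that the REGISTERn(..., GPU, ...) lines elsewhere in this patch rely on. A sketch of such a file, assuming a cwise_op_gpu_sqrt.cu.cc naming convention (the file itself is not part of this excerpt):

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"

namespace tensorflow {
namespace functor {
DEFINE_UNARY2(sqrt, float, double);
}  // namespace functor
}  // namespace tensorflow
#endif  // GOOGLE_CUDA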
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
new file mode 100644
index 0000000000..56af248117
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -0,0 +1,167 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+// Creates a Graph which applies a unary "func" on a 3D float tensor
+// of "num" elements.
+static Graph* Unary(const string& func, int num) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ CHECK_GT(data.NumElements(), 0);
+ data.flat<float>().setRandom();
+ test::graph::Unary(g, func, test::graph::Constant(g, data), 0);
+ return g;
+}
+
+static int kRows = 100000;
+
+static int RowsAndColsArg(int r, int c) { return r * kRows + c; }
+static int RowsFromArg(int arg) { return (arg / kRows); }
+static int ColsFromArg(int arg) { return (arg % kRows); }
+
+#define BM_UNARY(DEVICE, FUNC) \
+ static void BM_##DEVICE##_##FUNC(int iters, int num) { \
+ const int64 tot = static_cast<int64>(iters) * num; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, Unary(#FUNC, num)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##FUNC)->Range(4 << 10, 1 << 20);
+
+BM_UNARY(cpu, Floor);
+BM_UNARY(gpu, Floor);
+
+// Creates a Graph which applies a binary "func" to a 3D float tensor
+// of "num" elements and a scalar.
+static Graph* BinaryScalar(int num, const string& func) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ lhs.flat<float>().setRandom();
+ Tensor rhs(DT_FLOAT, TensorShape({}));
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, func, test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BINARY_SCALAR(DEVICE, FUNC) \
+ static void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \
+ const int64 tot = static_cast<int64>(iters) * num; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
+ ->Arg(4096) /* must >= 4096 */ \
+ ->Arg(32768) \
+ ->Arg(131072) \
+ ->Arg(1048576);
+
+BM_BINARY_SCALAR(cpu, Less);
+BM_BINARY_SCALAR(gpu, Less);
+BM_BINARY_SCALAR(cpu, Add);
+BM_BINARY_SCALAR(gpu, Add);
+#undef BM_BINARY_SCALAR
+
+static Graph* BiasAdd(int rows, int cols) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
+ lhs.flat<float>().setRandom();
+ TensorShape rhs_shape;
+ rhs_shape = TensorShape({cols});
+ Tensor rhs(DT_FLOAT, rhs_shape);
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BIAS_ADD(DEVICE, R, C) \
+ static void BM_##DEVICE##_BiasAdd_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BiasAdd(rows, cols)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BiasAdd_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BIAS_ADD_ALL(DEVICE) \
+ BM_BIAS_ADD(DEVICE, 512, 2048); \
+ BM_BIAS_ADD(DEVICE, 512, 4096); \
+ BM_BIAS_ADD(DEVICE, 2048, 512); \
+ BM_BIAS_ADD(DEVICE, 4096, 512);
+
+BM_BIAS_ADD_ALL(cpu);
+BM_BIAS_ADD_ALL(gpu);
+#undef BM_BIAS_ADD_ALL
+#undef BM_BIAS_ADD
+
+static Graph* BcastAdd(int rows, int cols, int dim) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
+ lhs.flat<float>().setRandom();
+ TensorShape rhs_shape;
+ if (dim == 0) {
+ rhs_shape = TensorShape({rows, 1});
+ } else {
+ rhs_shape = TensorShape({cols});
+ }
+ Tensor rhs(DT_FLOAT, rhs_shape);
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
+ static void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_ROW_ALL(DEVICE) \
+ BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \
+ BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \
+ BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
+ BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
+BM_BCAST_ADD_ROW_ALL(cpu);
+BM_BCAST_ADD_ROW_ALL(gpu);
+#undef BM_BCAST_ADD_ROW_ALL
+#undef BM_BCAST_ADD_ROW
+
+#define BM_BCAST_ADD_COL(DEVICE, R, C) \
+ static void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_COL_ALL(DEVICE) \
+ BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
+ BM_BCAST_ADD_COL(DEVICE, 512, 4096); \
+ BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
+ BM_BCAST_ADD_COL(DEVICE, 4096, 512);
+BM_BCAST_ADD_COL_ALL(cpu);
+BM_BCAST_ADD_COL_ALL(gpu);
+#undef BM_BCAST_ADD_COL_ALL
+#undef BM_BCAST_ADD_COL
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
new file mode 100644
index 0000000000..0919bab96f
--- /dev/null
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -0,0 +1,222 @@
+// See docs in ../ops/parsing_ops.cc.
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class DecodeCSVOp : public OpKernel {
+ public:
+ explicit DecodeCSVOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ string delim;
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("OUT_TYPE", &out_type_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
+
+ OP_REQUIRES(ctx, delim.size() == 1,
+ errors::InvalidArgument("field_delim should be only 1 char"));
+
+ delim_ = delim[0];
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* records;
+ OpInputList record_defaults;
+
+ OP_REQUIRES_OK(ctx, ctx->input("records", &records));
+ OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults));
+
+ for (int i = 0; i < record_defaults.size(); ++i) {
+ OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2,
+ errors::InvalidArgument(
+ "There should only be 1 default per field but field ", i,
+ " has ", record_defaults[i].NumElements()));
+ }
+
+ auto records_t = records->flat<string>();
+ int records_size = records_t.size();
+
+ OpOutputList output;
+ OP_REQUIRES_OK(ctx, ctx->output_list("output", &output));
+
+ for (size_t i = 0; i < out_type_.size(); ++i) {
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, output.allocate(i, records->shape(), &out));
+ }
+
+ for (int i = 0; i < records_size; ++i) {
+ const StringPiece record(records_t(i));
+ std::vector<string> fields;
+ ExtractFields(ctx, record, &fields);
+ OP_REQUIRES(ctx, fields.size() == out_type_.size(),
+ errors::InvalidArgument("Expect ", out_type_.size(),
+ " fields but have ", fields.size(),
+ " in record ", i));
+
+ // Check each field in the record
+ for (size_t f = 0; f < out_type_.size(); ++f) {
+ const DataType& dtype = out_type_[f];
+ switch (dtype) {
+ case DT_INT32: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+
+ output[f]->flat<int32>()(i) = record_defaults[f].flat<int32>()(0);
+ } else {
+ int32 value;
+ OP_REQUIRES(ctx, strings::safe_strto32(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid int32: ",
+ fields[f]));
+ output[f]->flat<int32>()(i) = value;
+ }
+ break;
+ }
+ case DT_INT64: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+
+ output[f]->flat<int64>()(i) = record_defaults[f].flat<int64>()(0);
+ } else {
+ int64 value;
+ OP_REQUIRES(ctx, strings::safe_strto64(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid int64: ",
+ fields[f]));
+ output[f]->flat<int64>()(i) = value;
+ }
+ break;
+ }
+ case DT_FLOAT: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+ output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0);
+ } else {
+ float value;
+ OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid float: ",
+ fields[f]));
+ output[f]->flat<float>()(i) = value;
+ }
+ break;
+ }
+ case DT_STRING: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+ output[f]->flat<string>()(i) =
+ record_defaults[f].flat<string>()(0);
+ } else {
+ output[f]->flat<string>()(i) = fields[f];
+ }
+ break;
+ }
+ default:
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument("csv: data type ", dtype,
+ " not supported in field ", f));
+ }
+ }
+ }
+ }
+
+ private:
+ std::vector<DataType> out_type_;
+ char delim_;
+
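+ // Illustrative example of the splitting rules implemented below: with
+ // delim_ == ',', the record
+ //   1,"a, ""b""",,3.5
+ // is split into the fields {"1", "a, \"b\"", "", "3.5"}; the empty third
+ // field is then filled from record_defaults if a default was provided.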
+ void ExtractFields(OpKernelContext* ctx, StringPiece input,
+ std::vector<string>* result) {
+ int current_idx = 0;
+ if (!input.empty()) {
+ while (static_cast<size_t>(current_idx) < input.size()) {
+ if (input[current_idx] == '\n' || input[current_idx] == '\r') {
+ current_idx++;
+ continue;
+ }
+
+ bool quoted = false;
+ if (input[current_idx] == '"') {
+ quoted = true;
+ current_idx++;
+ }
+
+ // This is the body of the field.
+ string field;
+ if (!quoted) {
+ while (static_cast<size_t>(current_idx) < input.size() &&
+ input[current_idx] != delim_) {
+ OP_REQUIRES(ctx, input[current_idx] != '"' &&
+ input[current_idx] != '\n' &&
+ input[current_idx] != '\r',
+ errors::InvalidArgument(
+ "Unquoted fields cannot have quotes/CRLFs inside"));
+ field += input[current_idx];
+ current_idx++;
+ }
+
+ // Go to next field or the end
+ current_idx++;
+ } else {
+ // A quoted field must end with '"' followed by the delimiter or the end of the record.
+ while (
+ (static_cast<size_t>(current_idx) < input.size() - 1) &&
+ (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
+ if (input[current_idx] != '"') {
+ field += input[current_idx];
+ current_idx++;
+ } else {
+ OP_REQUIRES(
+ ctx, input[current_idx + 1] == '"',
+ errors::InvalidArgument("Quote inside a string has to be "
+ "escaped by another quote"));
+ field += '"';
+ current_idx += 2;
+ }
+ }
+
+ OP_REQUIRES(
+ ctx,
+ input[current_idx] == '"' &&
+ (static_cast<size_t>(current_idx) == input.size() - 1 ||
+ input[current_idx + 1] == delim_),
+ errors::InvalidArgument("Quoted field has to end with quote "
+ "followed by delim or end"));
+
+ current_idx += 2;
+ }
+
+ result->push_back(field);
+ }
+
+ // Check if the last field is missing
+ if (input[input.size() - 1] == delim_) result->push_back(string());
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeCSV").Device(DEVICE_CPU), DecodeCSVOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_jpeg_op.cc b/tensorflow/core/kernels/decode_jpeg_op.cc
new file mode 100644
index 0000000000..e41d3f3e11
--- /dev/null
+++ b/tensorflow/core/kernels/decode_jpeg_op.cc
@@ -0,0 +1,72 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+
+namespace tensorflow {
+
+// Decode the contents of a JPEG file
+class DecodeJpegOp : public OpKernel {
+ public:
+ explicit DecodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("channels", &flags_.components));
+ OP_REQUIRES(context, flags_.components == 0 || flags_.components == 1 ||
+ flags_.components == 3,
+ errors::InvalidArgument("channels must be 0, 1, or 3, got ",
+ flags_.components));
+ OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
+ OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
+ flags_.ratio == 4 || flags_.ratio == 8,
+ errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
+ flags_.ratio));
+ OP_REQUIRES_OK(
+ context, context->GetAttr("fancy_upscaling", &flags_.fancy_upscaling));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("try_recover_truncated",
+ &flags_.try_recover_truncated_jpeg));
+ OP_REQUIRES_OK(context, context->GetAttr("acceptable_fraction",
+ &flags_.min_acceptable_fraction));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& contents = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+ errors::InvalidArgument("contents must be scalar, got shape ",
+ contents.shape().ShortDebugString()));
+ const StringPiece input = contents.scalar<string>()();
+ OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
+ errors::InvalidArgument("JPEG contents are too large for int: ",
+ input.size()));
+
+ // Decode image, allocating tensor once the image size is known
+ Tensor* output = NULL;
+ OP_REQUIRES(
+ context,
+ jpeg::Uncompress(
+ input.data(), input.size(), flags_, NULL,
+ [=, &output](int width, int height, int channels) -> uint8* {
+ Status status(context->allocate_output(
+ 0, TensorShape({height, width, channels}), &output));
+ if (!status.ok()) {
+ VLOG(1) << status;
+ context->SetStatus(status);
+ return nullptr;
+ }
+ return output->flat<uint8>().data();
+ }),
+ errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
+ }
+
+ private:
+ jpeg::UncompressFlags flags_;
+};
+REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeJpegOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_png_op.cc b/tensorflow/core/kernels/decode_png_op.cc
new file mode 100644
index 0000000000..e8071526f9
--- /dev/null
+++ b/tensorflow/core/kernels/decode_png_op.cc
@@ -0,0 +1,69 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+// Decode the contents of a PNG file
+class DecodePngOp : public OpKernel {
+ public:
+ explicit DecodePngOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
+ OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
+ channels_ == 4,
+ errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+ channels_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& contents = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+ errors::InvalidArgument("contents must be scalar, got shape ",
+ contents.shape().ShortDebugString()));
+
+ // Start decoding image to get shape details
+ const StringPiece data = contents.scalar<string>()();
+ png::DecodeContext decode;
+ OP_REQUIRES(
+ context, png::CommonInitDecode(data, channels_, 8, &decode),
+ errors::InvalidArgument("Invalid PNG header, data size ", data.size()));
+
+ // Verify that width and height don't overflow int
+ const int width = decode.width;
+ const int height = decode.height;
+ if (width != static_cast<int64>(decode.width) ||
+ height != static_cast<int64>(decode.height)) {
+ png::CommonFreeDecode(&decode);
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("PNG size too large for int: ",
+ decode.width, " by ", decode.height));
+ }
+
+ // Allocate tensor
+ Tensor* output = nullptr;
+ const auto status = context->allocate_output(
+ 0, TensorShape({height, width, decode.channels}), &output);
+ if (!status.ok()) png::CommonFreeDecode(&decode);
+ OP_REQUIRES_OK(context, status);
+
+ // Finish decoding image
+ OP_REQUIRES(
+ context, png::CommonFinishDecode(output->flat<uint8>().data(),
+ decode.channels * width, &decode),
+ errors::InvalidArgument("Invalid PNG data, size ", data.size()));
+ }
+
+ private:
+ int channels_;
+};
+REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodePngOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
new file mode 100644
index 0000000000..ef24c333a4
--- /dev/null
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -0,0 +1,90 @@
+// See docs in ../ops/parse_ops.cc.
+
+#include <algorithm>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
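+// Illustrative example of the reshaping performed by this op: with out_type
+// DT_INT32 (sizeof(T) == 4), an input of shape [2] whose strings are each
+// 8 bytes long decodes to an output of shape [2, 2].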
+template <typename T>
+class DecodeRawOp : public OpKernel {
+ public:
+ explicit DecodeRawOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("little_endian", &little_endian_));
+ OP_REQUIRES_OK(context, context->GetAttr("out_type", &out_type_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& input = context->input(0);
+ int str_size = -1;
+ auto flat_in = input.flat<string>();
+ for (int i = 0; i < flat_in.size(); ++i) {
+ const string& in_str = flat_in(i);
+ if (str_size == -1) {
+ str_size = in_str.size();
+ } else {
+ OP_REQUIRES(context, str_size == in_str.size(),
+ errors::InvalidArgument(
+ "DecodeRaw requires input strings to all be the same "
+ "size, but element ",
+ i, " has size ", str_size, " != ", in_str.size()));
+ }
+ }
+ TensorShape out_shape = input.shape();
+ if (str_size == -1) { // Empty input
+ out_shape.AddDim(1);
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("output", out_shape,
+ &output_tensor));
+ return;
+ }
+ OP_REQUIRES(
+ context, str_size % sizeof(T) == 0,
+ errors::InvalidArgument("Input to DecodeRaw has length ", str_size,
+ " that is not a multiple of ", sizeof(T),
+ ", the size of ", DataTypeString(out_type_)));
+ const int added_dim = str_size / sizeof(T);
+ out_shape.AddDim(added_dim);
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output("output", out_shape, &output_tensor));
+ auto out = output_tensor->flat_inner_dims<T>();
+ DCHECK_EQ(flat_in.size(), out.dimensions()[0]);
+ OP_REQUIRES(
+ context,
+ little_endian_ == ::tensorflow::port::kLittleEndian || sizeof(T) == 1,
+ errors::Unimplemented("Unimplemented support for little_endian=",
+ little_endian_ ? "true" : "false"));
+ // Endianness matches, so just copy each string byte-for-byte.
+ T* out_data = out.data();
+ for (int i = 0; i < flat_in.size(); ++i) {
+ const T* in_data = reinterpret_cast<const T*>(flat_in(i).data());
+ memcpy(out_data, in_data, str_size);
+ out_data += added_dim;
+ }
+ }
+
+ private:
+ bool little_endian_;
+ DataType out_type_;
+};
+
+#define REGISTER(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("DecodeRaw").Device(DEVICE_CPU).TypeConstraint<type>("out_type"), \
+ DecodeRawOp<type>)
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(int32);
+REGISTER(uint8);
+REGISTER(int16);
+REGISTER(int8);
+REGISTER(int64);
+
+#undef REGISTER
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
new file mode 100644
index 0000000000..f56c37b4ef
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -0,0 +1,136 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/assign_op.h"
+#include "tensorflow/core/kernels/dense_update_ops.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+template <typename Device, typename T>
+class AssignOpT : public AssignOp {
+ public:
+ using AssignOp::AssignOp;
+
+ void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override {
+ functor::DenseUpdate<Device, T, ASSIGN> copy;
+ copy(context->eigen_device<Device>(), lhs->flat<T>(), rhs.flat<T>());
+ }
+};
+
+// TODO(jeff): Get rid of use_exclusive_lock_ option
+template <typename Device, typename T, DenseUpdateType OP>
+class DenseUpdateOp : public OpKernel {
+ public:
+ explicit DenseUpdateOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("use_locking", &use_exclusive_lock_));
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({MakeRefType(dt), dt},
+ {MakeRefType(dt)}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // We always return the input ref.
+ context->forward_ref_input_to_ref_output(0, 0);
+
+ if (use_exclusive_lock_) {
+ mutex_lock l(*context->input_ref_mutex(0));
+ DoUpdate(context);
+ } else {
+ DoUpdate(context);
+ }
+ }
+
+ private:
+ void DoUpdate(OpKernelContext* context) {
+ Tensor Tparams = context->mutable_input(0, use_exclusive_lock_);
+ const Tensor& Tupdate = context->input(1);
+ OP_REQUIRES(context, Tparams.IsInitialized(),
+ errors::FailedPrecondition("Attempting to use uninitialized "
+ "parameters: ",
+ def().input(0)));
+ OP_REQUIRES(
+ context, Tparams.IsSameSize(Tupdate),
+ errors::InvalidArgument("Parameters and update must be the same size"));
+
+ functor::DenseUpdate<Device, T, OP> update_functor;
+ update_functor(context->eigen_device<Device>(), Tparams.flat<T>(),
+ Tupdate.flat<T>());
+ }
+
+ bool use_exclusive_lock_;
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Assign").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ AssignOpT<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Only register 'Assign' on GPU for the subset of types also supported by
+// 'Variable' (see variable_ops.cc.)
+#define REGISTER_GPU_KERNELS(type) \
+ namespace functor { \
+ template <> \
+ void DenseUpdate<GPUDevice, type, ASSIGN>::operator()( \
+ const GPUDevice& d, typename TTypes<type>::Flat lhs, \
+ typename TTypes<type>::ConstFlat rhs); \
+ extern template struct DenseUpdate<GPUDevice, type, ASSIGN>; \
+ } \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Assign").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ AssignOpT<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<CPUDevice, type, DenseUpdateType::ADD>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignSub").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<CPUDevice, type, DenseUpdateType::SUB>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC_FOR_OP(T, OP) \
+ template <> \
+ void DenseUpdate<GPUDevice, T, OP>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat params, \
+ typename TTypes<T>::ConstFlat update); \
+ extern template struct DenseUpdate<GPUDevice, T, OP>
+#define DECLARE_GPU_SPEC(T) \
+ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \
+ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB)
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+#undef DECLARE_GPU_SPEC
+#undef DECLARE_GPU_SPEC_FOR_OP
+} // namespace functor
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<GPUDevice, type, DenseUpdateType::ADD>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignSub").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<GPUDevice, type, DenseUpdateType::SUB>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // end GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.h b/tensorflow/core/kernels/dense_update_ops.h
new file mode 100644
index 0000000000..d32c9a4af2
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops.h
@@ -0,0 +1,43 @@
+#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
+#define TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+enum DenseUpdateType { ADD, SUB, ASSIGN };
+
+namespace functor {
+
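+// A minimal CPU usage sketch (illustrative only; the device and the two
+// tensors are assumed to come from the calling kernel, as in
+// dense_update_ops.cc):
+//
+//   const Eigen::ThreadPoolDevice& d =
+//       context->eigen_device<Eigen::ThreadPoolDevice>();
+//   functor::DenseUpdate<Eigen::ThreadPoolDevice, float, ADD> add;
+//   add(d, params.flat<float>(), update.flat<float>());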
+template <typename Device, typename T, DenseUpdateType OP>
+struct DenseUpdate;
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, ADD> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) += update;
+ }
+};
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, SUB> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) -= update;
+ }
+};
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, ASSIGN> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) = update;
+ }
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
diff --git a/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
new file mode 100644
index 0000000000..8e80901c71
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/dense_update_ops.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
+ template struct functor::DenseUpdate<GPUDevice, T, SUB>; \
+ template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc
new file mode 100644
index 0000000000..d34aab7a44
--- /dev/null
+++ b/tensorflow/core/kernels/determinant_op.cc
@@ -0,0 +1,66 @@
+// See docs in ../ops/linalg_ops.cc.
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/LU"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class DeterminantOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit DeterminantOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+ ~DeterminantOp() override {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return TensorShape({});
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+ // A big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ Scalar determinant;
+ if (input.rows() == 0) {
+ // The determinant of an empty matrix is defined to be 1 (see Wikipedia).
+ determinant = 1;
+ } else {
+ determinant = input.determinant();
+ }
+ OP_REQUIRES(context, std::isfinite(determinant),
+ errors::Internal("The determinant is not finite."));
+ (*output)(0, 0) = determinant;
+ }
+};
+
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<float, false>), float);
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<double, false>), double);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<float, true>),
+ float);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<double, true>),
+ double);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
new file mode 100644
index 0000000000..83e39d33a9
--- /dev/null
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -0,0 +1,93 @@
+// See docs in ../ops/array_ops.cc
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace {
+template <typename T, size_t NumDims, size_t DoubleNumDims>
+class DiagonalGenerator {
+ public:
+ explicit DiagonalGenerator(const Tensor& diagonal) : diagonal_(diagonal) {
+ static_assert(DoubleNumDims == 2 * NumDims,
+ "The second size must be the double of the first size.");
+ CHECK_EQ(diagonal.dims(), NumDims);
+ }
+ T operator()(
+ const Eigen::array<Eigen::DenseIndex, DoubleNumDims>& coordinates) const {
+ Eigen::array<Eigen::DenseIndex, NumDims> index;
+ for (int i = 0; i < NumDims; ++i) {
+ if (coordinates[i] != coordinates[NumDims + i]) {
+ return T(0);
+ }
+ index[i] = coordinates[i];
+ }
+ return diagonal_.tensor<T, NumDims>()(index);
+ }
+
+ private:
+ Tensor diagonal_;
+};
+} // namespace
+
+// Generate the diagonal tensor with the diagonal set to the input tensor.
+// It only allows up to rank 3 input tensor, so the output tensor is up to
+// rank 6.
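+// For example, a rank-1 input [1, 2, 3] produces the rank-2 output
+//   [[1, 0, 0],
+//    [0, 2, 0],
+//    [0, 0, 3]].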
+template <typename T>
+class DiagOp : public OpKernel {
+ public:
+ explicit DiagOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& diagonal = context->input(0);
+ const int num_dims = diagonal.dims();
+ OP_REQUIRES(context, 1 <= num_dims,
+ errors::InvalidArgument(
+ "The rank of the diagonal should be between 1 and 3."));
+ OP_REQUIRES(context, 3 >= num_dims,
+ errors::InvalidArgument(
+ "The rank of the diagonal should be between 1 and 3."));
+ TensorShape out_shape;
+ for (int i = 0; i < num_dims; ++i) {
+ out_shape.AddDim(diagonal.dim_size(i));
+ }
+ for (int i = 0; i < num_dims; ++i) {
+ out_shape.AddDim(diagonal.dim_size(i));
+ }
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, out_shape, &output_tensor));
+ switch (num_dims) {
+ case 1:
+ output_tensor->tensor<T, 2>() = output_tensor->tensor<T, 2>().generate(
+ DiagonalGenerator<T, 1, 2>(diagonal));
+ break;
+ case 2:
+ output_tensor->tensor<T, 4>() = output_tensor->tensor<T, 4>().generate(
+ DiagonalGenerator<T, 2, 4>(diagonal));
+ break;
+ case 3:
+ output_tensor->tensor<T, 6>() = output_tensor->tensor<T, 6>().generate(
+ DiagonalGenerator<T, 3, 6>(diagonal));
+ break;
+ default:
+ context->SetStatus(errors::Unimplemented(
+ "Diagonal of rank ", num_dims, " tensor is not supported yet."));
+ return;
+ }
+ }
+};
+
+#define REGISTER_DIAGOP(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Diag").Device(DEVICE_CPU).TypeConstraint<T>("T"), DiagOp<T>)
+
+REGISTER_DIAGOP(double);
+REGISTER_DIAGOP(float);
+REGISTER_DIAGOP(int32);
+REGISTER_DIAGOP(int64);
+
+#undef REGISTER_DIAGOP
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
new file mode 100644
index 0000000000..f1b44861b5
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -0,0 +1,154 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// Shared code that is not dependent on the type of T. We do this to reduce
+// code size by not duplicating all this for all T (float, double, int32, etc.)
+class DynamicPartitionOp_Shared : public OpKernel {
+ public:
+ explicit DynamicPartitionOp_Shared(OpKernelConstruction* c) : OpKernel(c) {
+ OP_REQUIRES_OK(c, c->GetAttr("num_partitions", &num_partitions_));
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8, etc.
+ // to input[1]. Should we have the framework do some sort of
+ // integer promotion automatically, or should that be something
+ // that users have to do explicitly with a conversion operator
+ // in the graph?
+ }
+
+ void ValidateAndAllocateOutputs(OpKernelContext* c, const Tensor** data,
+ const Tensor** partitions,
+ OpOutputList* Tout) {
+ OP_REQUIRES_OK(c, c->input("data", data));
+ OP_REQUIRES_OK(c, c->input("partitions", partitions));
+ OP_REQUIRES(c, TensorShapeUtils::StartsWith((*data)->shape(),
+ (*partitions)->shape()),
+ errors::InvalidArgument(
+ "data.shape must start with partitions.shape, ",
+ "got data.shape = ", (*data)->shape().ShortDebugString(),
+ ", partitions.shape = ",
+ (*partitions)->shape().ShortDebugString()));
+
+ // Count how many occurrences of each partition id we have in partitions
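+ // (For example, partitions = [0, 0, 1, 2] with num_partitions_ = 3 gives
+ // partition_count = {2, 1, 1}.)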
+ gtl::InlinedVector<int, 32> partition_count(num_partitions_);
+ auto e_partitions = (*partitions)->flat<int32>();
+ const int64 N = e_partitions.dimension(0);
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ OP_REQUIRES(c, p >= 0 && p < num_partitions_,
+ errors::InvalidArgument(
+ "partitions", SliceString((*partitions)->shape(), i),
+ " = ", p, " is not in [0, ", num_partitions_, ")"));
+ partition_count[p]++;
+ }
+
+ // Allocate output tensors of the right size
+ OP_REQUIRES_OK(c, c->output_list("outputs", Tout));
+ for (int p = 0; p < num_partitions_; p++) {
+ TensorShape shape;
+ shape.AddDim(partition_count[p]);
+ for (int i = (*partitions)->dims(); i < (*data)->dims(); i++) {
+ shape.AddDim((*data)->dim_size(i));
+ }
+ Tensor* out;
+ OP_REQUIRES_OK(c, Tout->allocate(p, shape, &out));
+ }
+ }
+
+ protected:
+ int num_partitions_;
+
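+ // Renders a flat index into "partitions" as a bracketed coordinate for the
+ // error message above. For example, for shape [2, 3, 4] the strides are
+ // {12, 4, 1}, so SliceString(shape, 17) returns "[1,1,1]".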
+ static string SliceString(const TensorShape& shape, const int64 flat) {
+ // Special case rank 0 and 1
+ const int dims = shape.dims();
+ if (dims == 0) return "";
+ if (dims == 1) return strings::StrCat("[", flat, "]");
+
+ // Compute strides
+ gtl::InlinedVector<int64, 32> strides(dims);
+ strides.back() = 1;
+ for (int i = dims - 2; i >= 0; i--) {
+ strides[i] = strides[i + 1] * shape.dim_size(i + 1);
+ }
+
+ // Unflatten index
+ int64 left = flat;
+ string result;
+ for (int i = 0; i < dims; i++) {
+ strings::StrAppend(&result, i ? "," : "[", left / strides[i]);
+ left %= strides[i];
+ }
+ strings::StrAppend(&result, "]");
+ return result;
+ }
+};
+
+template <class T>
+class DynamicPartitionOp : public DynamicPartitionOp_Shared {
+ public:
+ explicit DynamicPartitionOp(OpKernelConstruction* c)
+ : DynamicPartitionOp_Shared(c) {}
+ void Compute(OpKernelContext* c) override {
+ const Tensor* data;
+ const Tensor* partitions;
+ OpOutputList outputs;
+ ValidateAndAllocateOutputs(c, &data, &partitions, &outputs);
+ if (!c->status().ok()) return;
+ if (num_partitions_ == 0 || data->NumElements() == 0) return;
+
+ auto e_partitions = partitions->flat<int32>();
+ const int64 N = e_partitions.dimension(0);
+ gtl::InlinedVector<int, 32> output_index(num_partitions_);
+
+ if (partitions->dims() == data->dims()) {
+ // Walk through data and copy the data to the appropriate output tensor
+ const auto data_flat = data->flat<T>();
+ std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
+ Eigen::Aligned> > out_vec;
+ for (int p = 0; p < num_partitions_; p++) {
+ out_vec.push_back(outputs[p]->vec<T>());
+ }
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ out_vec[p](output_index[p]) = data_flat(i);
+ output_index[p]++;
+ }
+ } else {
+ // If data has extra dimensions, use Eigen slices
+ std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
+ Eigen::Aligned> > out_flat;
+ for (int p = 0; p < num_partitions_; p++) {
+ out_flat.push_back(outputs[p]->flat_outer_dims<T>());
+ }
+
+ // Walk through data and copy the data to the appropriate output tensor
+ const int64 slice_size = data->NumElements() / N;
+ const auto data_flat = data->shaped<T, 2>({N, slice_size});
+ Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size);
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ // outputs[p][output_index[p]++] = data[i]
+ Eigen::DSizes<Eigen::DenseIndex, 2> out_indices(output_index[p], 0);
+ Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0);
+ out_flat[p].slice(out_indices, sizes) =
+ data_flat.slice(data_indices, sizes);
+ output_index[p]++;
+ }
+ }
+ }
+};
+
+#define REGISTER_DYNAMIC_PARTITION(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("DynamicPartition").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ DynamicPartitionOp<T>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION);
+#undef REGISTER_DYNAMIC_PARTITION
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
new file mode 100644
index 0000000000..b0e5e7deb0
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -0,0 +1,145 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicPartitionOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "DynamicPartition")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Attr("num_partitions", 4)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(DynamicPartitionOpTest, Simple_OneD) {
+ MakeOp();
+
+ // Similar to how we would use this to split embedding ids to be looked up
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2}));
+ test::FillValues<float>(&expected, {0, 13});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
+ test::FillValues<float>(&expected, {17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2}));
+ test::FillValues<float>(&expected, {2, 4});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
+ test::FillValues<float>(&expected, {39});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, Simple_TwoD) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(
+ TensorShape({6, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3}));
+ test::FillValues<float>(&expected, {15, 16, 17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {6, 7, 8, 12, 13, 14});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3}));
+ test::FillValues<float>(&expected, {9, 10, 11});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, SomeOutputsEmpty) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 2, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ TensorShape empty_one_dim;
+ empty_one_dim.AddDim(0);
+ Tensor expected_empty(allocator(), DT_FLOAT, empty_one_dim);
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
+ test::FillValues<float>(&expected, {0, 13, 4});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ test::ExpectTensorEqual<float>(expected_empty, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
+ test::FillValues<float>(&expected, {2, 39, 17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ test::ExpectTensorEqual<float>(expected_empty, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+ << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
new file mode 100644
index 0000000000..a5623685fb
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -0,0 +1,158 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
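+// Illustrative example of the stitching performed by this op: with
+//   indices = {[0, 2], [1, 3]} and data = {[10, 30], [20, 40]},
+// the merged output is [10, 20, 30, 40].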
+template <class T>
+class DynamicStitchOp : public OpKernel {
+ public:
+ explicit DynamicStitchOp(OpKernelConstruction* c) : OpKernel(c) {
+ // Compute expected input signature
+ const DataType dt = DataTypeToEnum<T>::v();
+ const int n = c->num_inputs() / 2;
+ DataTypeVector expected;
+ for (int i = 0; i < n; i++) {
+ expected.push_back(DT_INT32);
+ }
+ for (int i = 0; i < n; i++) {
+ expected.push_back(dt);
+ }
+ OP_REQUIRES_OK(c, c->MatchSignature(expected, {dt}));
+ OP_REQUIRES(
+ c, c->num_inputs() > 0,
+ errors::InvalidArgument("DynamicStitchOp: Must have some inputs"));
+ OP_REQUIRES(c, c->num_inputs() % 2 == 0,
+ errors::InvalidArgument(
+ "DynamicStitchOp: Must have even number of arguments"));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ // Find maximum index in the indices vectors
+ OpInputList indices_inputs;
+ OP_REQUIRES_OK(c, c->input_list("indices", &indices_inputs));
+
+ int32 max_index = -1;
+ for (const Tensor& indices : indices_inputs) {
+ Eigen::Tensor<int32, 0, Eigen::RowMajor> m =
+ indices.flat<int32>().maximum();
+ max_index = std::max(m(), max_index);
+ }
+ const int first_dim_size = max_index + 1;
+
+ // Validate that data[i].shape = indices[i].shape + constant
+ OpInputList data_inputs;
+ OP_REQUIRES_OK(c, c->input_list("data", &data_inputs));
+ const Tensor& data0 = data_inputs[0];
+ const Tensor& indices0 = indices_inputs[0];
+ for (int input_num = 0; input_num < indices_inputs.size(); input_num++) {
+ const Tensor& indices = indices_inputs[input_num];
+ const Tensor& data = data_inputs[input_num];
+ OP_REQUIRES(
+ c, TensorShapeUtils::StartsWith(data.shape(), indices.shape()),
+ errors::InvalidArgument(
+ "data[", input_num, "].shape = ", data.shape().ShortDebugString(),
+ " does not start with indices[", input_num, "].shape = ",
+ indices.shape().ShortDebugString()));
+ OP_REQUIRES(
+ c, input_num == 0 || SameExtraShape(data0, indices0, data, indices),
+ errors::InvalidArgument(
+ "Need data[0].shape[", indices0.dims(), ":] = data[", input_num,
+ "].shape[", indices.dims(), ":], got data[0].shape = ",
+ data0.shape().ShortDebugString(), ", data[", input_num,
+ "].shape = ", data.shape().ShortDebugString(),
+ ", indices[0].shape = ", indices0.shape().ShortDebugString(),
+ ", indices[", input_num, "].shape = ",
+ indices.shape().ShortDebugString()));
+ }
+
+ // Allocate result tensor of shape
+ // [first_dim_size] + data.shape[indices.dims:]
+ TensorShape result_shape;
+ result_shape.AddDim(first_dim_size);
+ for (int d = indices0.dims(); d < data0.dims(); d++) {
+ result_shape.AddDim(data0.dim_size(d));
+ }
+ Tensor* merged = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &merged));
+
+ // TODO(jeff): Currently we leave uninitialized any portions of
+ // merged that aren't covered by an index in indices. What should we do?
+ if (first_dim_size > 0) {
+ auto merged_flat = merged->flat_outer_dims<T>();
+ const int slice_size = merged_flat.dimension(1);
+ for (int input_num = 0; input_num < indices_inputs.size(); input_num++) {
+ const Tensor& indices = indices_inputs[input_num];
+ auto indices_vec = indices.flat<int32>();
+ const Tensor& data = data_inputs[input_num];
+ auto data_flat =
+ data.shaped<T, 2>({indices_vec.dimension(0), slice_size});
+
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ T* merged_base = &merged_flat(0, 0);
+ const T* data_base = &data_flat(0, 0);
+ const size_t slice_bytes = slice_size * sizeof(T);
+ for (int i = 0; i < indices_vec.size(); i++) {
+ memcpy(merged_base + indices_vec(i) * slice_size,
+ data_base + i * slice_size, slice_bytes);
+ }
+ } else {
+ Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size);
+ for (int i = 0; i < indices_vec.size(); i++) {
+ // Copy slice data[i] to merged[indices[i]]
+ Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0);
+ Eigen::DSizes<Eigen::DenseIndex, 2> merged_indices(indices_vec(i),
+ 0);
+ merged_flat.slice(merged_indices, sizes) =
+ data_flat.slice(data_indices, sizes);
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ // Check if data0.shape[indices0.dims():] == data1.shape[indices1.dims():]
+ static bool SameExtraShape(const Tensor& data0, const Tensor& indices0,
+ const Tensor& data1, const Tensor& indices1) {
+ const int extra0 = data0.dims() - indices0.dims();
+ const int extra1 = data1.dims() - indices1.dims();
+ if (extra0 != extra1) return false;
+ for (int i = 0; i < extra0; i++) {
+ if (data0.dim_size(indices0.dims() + i) !=
+ data1.dim_size(indices1.dims() + i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+};
+
+#define REGISTER_DYNAMIC_STITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("indices"), \
+ DynamicStitchOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH);
+#undef REGISTER_DYNAMIC_STITCH
+
+#if GOOGLE_CUDA
+#define REGISTER_DYNAMIC_STITCH_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("indices") \
+ .HostMemory("data") \
+ .HostMemory("merged"), \
+ DynamicStitchOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_GPU);
+#undef REGISTER_DYNAMIC_STITCH_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
new file mode 100644
index 0000000000..8c71f0fd0f
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -0,0 +1,133 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicStitchOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int n, DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "DynamicStitch")
+ .Input(FakeInput(n, DT_INT32))
+ .Input(FakeInput(n, dt))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(DynamicStitchOpTest, Simple_OneD) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8}));
+ test::FillValues<float>(&expected, {0, 10, 20, 30, 40, 50, 60, 70});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(DynamicStitchOpTest, Simple_TwoD) {
+ MakeOp(3, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 6});
+ AddInputFromArray<int32>(TensorShape({3}), {2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3, 2}), {0, 1, 40, 41, 70, 71});
+ AddInputFromArray<float>(TensorShape({2, 2}), {10, 11, 60, 61});
+ AddInputFromArray<float>(TensorShape({3, 2}), {20, 21, 30, 31, 50, 51});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8, 2}));
+ test::FillValues<float>(&expected, {0, 1, 10, 11, 20, 21, 30, 31, 40, 41, 50,
+ 51, 60, 61, 70, 71});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({1, 5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("data[1].shape = [5] does not start with "
+ "indices[1].shape = [1,5]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("data[1].shape = [1,5] does not start with "
+ "indices[1].shape = [5]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 5});
+ AddInputFromArray<int32>(TensorShape({4}), {1, 6, 2, 3});
+ AddInputFromArray<float>(TensorShape({3, 1}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({4, 2}),
+ {10, 11, 60, 61, 20, 21, 30, 31});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Need data[0].shape[1:] = data[1].shape[1:], "
+ "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString())
+ .contains(
+ "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+ << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/edit_distance_op.cc b/tensorflow/core/kernels/edit_distance_op.cc
new file mode 100644
index 0000000000..938d7f056b
--- /dev/null
+++ b/tensorflow/core/kernels/edit_distance_op.cc
@@ -0,0 +1,217 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <limits>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/edit_distance.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+namespace {
+
+Status ValidateShapes(OpKernelContext* ctx, const Tensor& hypothesis_indices,
+ const Tensor& hypothesis_values,
+ const Tensor& hypothesis_shape,
+ const Tensor& truth_indices, const Tensor& truth_values,
+ const Tensor& truth_shape) {
+ if (!TensorShapeUtils::IsMatrix(hypothesis_indices.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_indices should be a matrix, but got shape: ",
+ hypothesis_indices.shape().DebugString());
+ if (!TensorShapeUtils::IsMatrix(truth_indices.shape()))
+ return errors::InvalidArgument(
+ "truth_indices should be a matrix, but got shape: ",
+ truth_indices.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(hypothesis_values.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_values should be a vector, but got shape: ",
+ hypothesis_values.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(truth_values.shape()))
+ return errors::InvalidArgument(
+ "truth_values should be a vector, but got shape: ",
+ truth_values.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(hypothesis_shape.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_shape should be a vector, but got shape: ",
+ hypothesis_shape.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(truth_shape.shape()))
+ return errors::InvalidArgument(
+ "truth_shape should be a vector, but got shape: ",
+ truth_shape.shape().DebugString());
+ if (hypothesis_shape.NumElements() != hypothesis_indices.dim_size(1))
+ return errors::InvalidArgument(
+ "Expected hypothesis_shape.NumElements == "
+ "#cols(hypothesis_indices), their shapes are: ",
+ hypothesis_shape.shape().DebugString(), " and ",
+ hypothesis_indices.shape().DebugString());
+ if (truth_shape.NumElements() < 2)
+ return errors::InvalidArgument(
+ "Input SparseTensors must have rank at least 2, but truth_shape "
+ "rank is: ",
+ truth_shape.NumElements());
+ if (truth_shape.NumElements() != truth_indices.dim_size(1))
+ return errors::InvalidArgument(
+ "Expected truth_shape.NumElements == "
+ "#cols(truth_indices), their shapes are: ",
+ truth_shape.shape().DebugString(), " and ",
+ truth_indices.shape().DebugString());
+ if (truth_shape.NumElements() != hypothesis_shape.NumElements())
+ return errors::InvalidArgument(
+ "Expected truth and hypothesis to have matching ranks, but "
+ "their shapes are: ",
+ truth_shape.shape().DebugString(), " and ",
+ hypothesis_shape.shape().DebugString());
+
+ return Status::OK();
+}
+
+} // namespace
+
+template <typename T>
+class EditDistanceOp : public OpKernel {
+ public:
+ explicit EditDistanceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("normalize", &normalize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* hypothesis_indices;
+ const Tensor* hypothesis_values;
+ const Tensor* hypothesis_shape;
+ const Tensor* truth_indices;
+ const Tensor* truth_values;
+ const Tensor* truth_shape;
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_indices", &hypothesis_indices));
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_values", &hypothesis_values));
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_shape", &hypothesis_shape));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_indices", &truth_indices));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_values", &truth_values));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_shape", &truth_shape));
+
+ OP_REQUIRES_OK(
+ ctx, ValidateShapes(ctx, *hypothesis_indices, *hypothesis_values,
+ *hypothesis_shape, *truth_indices, *truth_values,
+ *truth_shape));
+
+ TensorShape hypothesis_st_shape = TensorShapeUtils::MakeShape(
+ hypothesis_shape->vec<int64>().data(), hypothesis_shape->NumElements());
+ TensorShape truth_st_shape = TensorShapeUtils::MakeShape(
+ truth_shape->vec<int64>().data(), truth_shape->NumElements());
+
+ // Assume indices are sorted in row-major order.
+ std::vector<int64> sorted_order(truth_st_shape.dims());
+ std::iota(sorted_order.begin(), sorted_order.end(), 0);
+
+ sparse::SparseTensor hypothesis(*hypothesis_indices, *hypothesis_values,
+ hypothesis_st_shape, sorted_order);
+ sparse::SparseTensor truth(*truth_indices, *truth_values, truth_st_shape,
+ sorted_order);
+
+ // Group dims 0, 1, ..., RANK - 1. The very last dim is assumed
+ // to store the variable length sequences.
+ std::vector<int64> group_dims(truth_st_shape.dims() - 1);
+ std::iota(group_dims.begin(), group_dims.end(), 0);
+
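+ // For example, with truth shape [2, 5, 10] and hypothesis shape [2, 6, 8],
+ // the grouped dims are {0, 1} and the output below gets shape [2, 6].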
+ TensorShape output_shape;
+ for (int d = 0; d < group_dims.size(); ++d) {
+ output_shape.AddDim(std::max(hypothesis_st_shape.dim_size(d),
+ truth_st_shape.dim_size(d)));
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output("output", output_shape, &output));
+ auto output_t = output->flat<float>();
+ output_t.setZero();
+
+ std::vector<int64> output_strides(output_shape.dims());
+ output_strides[output_shape.dims() - 1] = 1;
+ for (int d = output_shape.dims() - 2; d >= 0; --d) {
+ output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
+ }
+
+ auto hypothesis_grouper = hypothesis.group(group_dims);
+ auto truth_grouper = truth.group(group_dims);
+
+ auto hypothesis_iter = hypothesis_grouper.begin();
+ auto truth_iter = truth_grouper.begin();
+
+ auto cmp = std::equal_to<T>();
+
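+ // The groupers iterate over group keys in sorted order, so the loops below
+ // form a merge join: matching groups are compared directly, while a group
+ // present on only one side is scored against an empty sequence.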
+ while (hypothesis_iter != hypothesis_grouper.end() &&
+ truth_iter != truth_grouper.end()) {
+ sparse::Group truth_i = *truth_iter;
+ sparse::Group hypothesis_j = *hypothesis_iter;
+ std::vector<int64> g_truth = truth_i.group();
+ std::vector<int64> g_hypothesis = hypothesis_j.group();
+ auto truth_seq = truth_i.values<T>();
+ auto hypothesis_seq = hypothesis_j.values<T>();
+
+ if (g_truth == g_hypothesis) {
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) =
+ gtl::LevenshteinDistance<T>(truth_seq, hypothesis_seq, cmp);
+ if (normalize_) output_t(loc) /= truth_seq.size();
+
+ ++hypothesis_iter;
+ ++truth_iter;
+ } else if (g_truth > g_hypothesis) { // missing truth @ this hypothesis
+ auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = hypothesis_seq.size();
+ if (normalize_) output_t(loc) /= 0.0;
+ ++hypothesis_iter;
+ } else { // missing hypothesis @ this truth
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = (normalize_) ? 1.0 : truth_seq.size();
+ ++truth_iter;
+ }
+ }
+ while (hypothesis_iter != hypothesis_grouper.end()) { // missing truths
+ sparse::Group hypothesis_j = *hypothesis_iter;
+ std::vector<int64> g_hypothesis = hypothesis_j.group();
+ auto hypothesis_seq = hypothesis_j.values<T>();
+ auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = hypothesis_seq.size();
+ if (normalize_) output_t(loc) /= 0.0;
+ ++hypothesis_iter;
+ }
+ while (truth_iter != truth_grouper.end()) { // missing hypotheses
+ sparse::Group truth_i = *truth_iter;
+ std::vector<int64> g_truth = truth_i.group();
+ auto truth_seq = truth_i.values<T>();
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = (normalize_) ? 1.0 : truth_seq.size();
+ ++truth_iter;
+ }
+ }
+
+ private:
+ bool normalize_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EditDistanceOp);
+};
+
+#define REGISTER_CPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("EditDistance").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ EditDistanceOp<T>);
+
+TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
+
+#undef REGISTER_CPU_KERNEL
+
+} // end namespace tensorflow
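
The kernel above addresses its flat output buffer by taking the inner product of a group index with row-major strides derived from output_shape. A minimal standalone sketch of that arithmetic, with made-up dimensions that are not part of the kernel:

    // Row-major strides for dims {2, 3} are {3, 1}; the group index {1, 2}
    // therefore lands at flat offset 1*3 + 2*1 = 5.
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      const std::vector<long long> dims = {2, 3};  // stand-in for output_shape
      std::vector<long long> strides(dims.size());
      strides[dims.size() - 1] = 1;
      for (int d = static_cast<int>(dims.size()) - 2; d >= 0; --d) {
        strides[d] = strides[d + 1] * dims[d + 1];
      }
      const std::vector<long long> group = {1, 2};  // stand-in for g_truth
      const long long loc = std::inner_product(group.begin(), group.end(),
                                               strides.begin(), 0LL);
      std::printf("flat index = %lld\n", loc);  // prints 5
      return 0;
    }
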
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
new file mode 100644
index 0000000000..8f5fd2f8be
--- /dev/null
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -0,0 +1,114 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+
+namespace tensorflow {
+
+// Encode an image to a JPEG stream
+class EncodeJpegOp : public OpKernel {
+ public:
+ explicit EncodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("format", &format_));
+ if (format_.empty()) {
+ flags_.format = static_cast<jpeg::Format>(0);
+ } else if (format_ == "grayscale") {
+ flags_.format = jpeg::FORMAT_GRAYSCALE;
+ } else if (format_ == "rgb") {
+ flags_.format = jpeg::FORMAT_RGB;
+ } else {
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "format must be '', grayscale or rgb, got ", format_));
+ }
+
+ OP_REQUIRES_OK(context, context->GetAttr("quality", &flags_.quality));
+ OP_REQUIRES(context, 0 <= flags_.quality && flags_.quality <= 100,
+ errors::InvalidArgument("quality must be in [0,100], got ",
+ flags_.quality));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("progressive", &flags_.progressive));
+ OP_REQUIRES_OK(
+ context, context->GetAttr("optimize_size", &flags_.optimize_jpeg_size));
+ OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
+ &flags_.chroma_downsampling));
+ OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
+ &flags_.chroma_downsampling));
+
+ string density_unit;
+ OP_REQUIRES_OK(context, context->GetAttr("density_unit", &density_unit));
+ if (density_unit == "in") {
+ flags_.density_unit = 1;
+ } else if (density_unit == "cm") {
+ flags_.density_unit = 2;
+ } else {
+ OP_REQUIRES(context, false,
+                  errors::InvalidArgument(
+                      "density_unit must be 'in' or 'cm', got ", density_unit));
+ }
+
+ OP_REQUIRES_OK(context, context->GetAttr("x_density", &flags_.x_density));
+ OP_REQUIRES_OK(context, context->GetAttr("y_density", &flags_.y_density));
+ OP_REQUIRES_OK(context, context->GetAttr("xmp_metadata", &xmp_metadata_));
+ flags_.xmp_metadata = xmp_metadata_; // StringPiece doesn't own data
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& image = context->input(0);
+ OP_REQUIRES(context, image.dims() == 3,
+ errors::InvalidArgument("image must be 3-dimensional",
+ image.shape().ShortDebugString()));
+
+ // Autodetect format if desired, otherwise make sure format and
+ // image channels are consistent.
+ int channels;
+ jpeg::CompressFlags adjusted_flags = flags_;
+ if (flags_.format == 0) {
+ channels = image.dim_size(2);
+ if (channels == 1) {
+ adjusted_flags.format = jpeg::FORMAT_GRAYSCALE;
+ } else if (channels == 3) {
+ adjusted_flags.format = jpeg::FORMAT_RGB;
+ } else {
+ OP_REQUIRES(context, false, errors::InvalidArgument(
+ "image must have 1 or 3 channels, got ",
+ image.shape().ShortDebugString()));
+ }
+ } else {
+ if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
+ channels = 1;
+ } else { // RGB
+ channels = 3;
+ }
+ OP_REQUIRES(context, channels == image.dim_size(2),
+ errors::InvalidArgument("format ", format_, " expects ",
+ channels, " channels, got ",
+ image.shape().ShortDebugString()));
+ }
+
+ // Encode image to jpeg string
+ Tensor* output = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ OP_REQUIRES(context,
+ jpeg::Compress(image.flat<uint8>().data(), image.dim_size(1),
+ image.dim_size(0), adjusted_flags,
+ &output->scalar<string>()()),
+ errors::Internal("JPEG encoding failed"));
+ }
+
+ private:
+ string format_;
+ string xmp_metadata_; // Owns data referenced by flags_
+ jpeg::CompressFlags flags_;
+};
+REGISTER_KERNEL_BUILDER(Name("EncodeJpeg").Device(DEVICE_CPU), EncodeJpegOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc
new file mode 100644
index 0000000000..5249074377
--- /dev/null
+++ b/tensorflow/core/kernels/encode_png_op.cc
@@ -0,0 +1,52 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+// Encode an image to a PNG stream
+class EncodePngOp : public OpKernel {
+ public:
+ explicit EncodePngOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("compression", &compression_));
+ OP_REQUIRES(context, -1 <= compression_ && compression_ <= 9,
+ errors::InvalidArgument("compression should be in [-1,9], got ",
+ compression_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& image = context->input(0);
+ OP_REQUIRES(context, image.dims() == 3,
+ errors::InvalidArgument("image must be 3-dimensional",
+ image.shape().ShortDebugString()));
+ const int64 channels = image.dim_size(2);
+ OP_REQUIRES(context, channels == 1 || channels == 3 || channels == 4,
+ errors::InvalidArgument(
+ "image must have 1, 3, or 4 channels, got ", channels));
+
+ // Encode image to png string
+ Tensor* output = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ OP_REQUIRES(context,
+ png::WriteImageToBuffer(
+ image.flat<uint8>().data(), image.dim_size(1),
+ image.dim_size(0), image.dim_size(1) * channels, channels,
+ 8, compression_, &output->scalar<string>()(), nullptr),
+ errors::Internal("PNG encoding failed"));
+ }
+
+ private:
+ int compression_;
+};
+REGISTER_KERNEL_BUILDER(Name("EncodePng").Device(DEVICE_CPU), EncodePngOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
new file mode 100644
index 0000000000..c217c18207
--- /dev/null
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -0,0 +1,444 @@
+// See docs in ../ops/parsing_ops.cc.
+
+#include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+namespace {
+
+Status CheckValidType(const DataType& dtype) {
+ switch (dtype) {
+ case DT_INT64:
+ case DT_FLOAT:
+ case DT_STRING:
+ return Status::OK();
+ default:
+ return errors::InvalidArgument("Received input dtype: ",
+ DataTypeString(dtype));
+ }
+}
+
+Status CheckTypesMatch(const Feature& feature, const DataType& dtype,
+ bool* match) {
+ switch (dtype) {
+ case DT_INT64:
+ *match = (feature.kind_case() == Feature::kInt64List);
+ break;
+ case DT_FLOAT:
+ *match = (feature.kind_case() == Feature::kFloatList);
+ break;
+ case DT_STRING:
+ *match = (feature.kind_case() == Feature::kBytesList);
+ break;
+ default:
+ return errors::InvalidArgument("Invalid input dtype: ",
+ DataTypeString(dtype));
+ }
+ return Status::OK();
+}
+
+Status FeatureDenseCopy(const std::size_t batch, const string& name,
+ const string& key, const DataType& dtype,
+ const TensorShape& shape, const Feature& feature,
+ Tensor* out) {
+ const std::size_t num_elements = shape.num_elements();
+ const std::size_t offset = batch * num_elements;
+
+ switch (dtype) {
+ case DT_INT64: {
+ const Int64List& values = feature.int64_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key: ", key,
+ ". Number of int64 values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<int64>().data() + offset;
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return Status::OK();
+ }
+ case DT_FLOAT: {
+ const FloatList& values = feature.float_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key: ", key,
+ ". Number of float values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<float>().data() + offset;
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return Status::OK();
+ }
+ case DT_STRING: {
+ const BytesList& values = feature.bytes_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key ", key,
+ ". number of bytes values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<string>().data() + offset;
+ std::transform(values.value().data(),
+ values.value().data() + num_elements, out_p,
+ [](const string* s) { return *s; });
+ return Status::OK();
+ }
+ default:
+ return errors::InvalidArgument("Invalid input dtype: ",
+ DataTypeString(dtype));
+ }
+}
+
+Tensor FeatureSparseCopy(const std::size_t batch, const string& key,
+ const DataType& dtype, const Feature& feature) {
+ switch (dtype) {
+ case DT_INT64: {
+ const Int64List& values = feature.int64_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<int64>().data();
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return out;
+ }
+ case DT_FLOAT: {
+ const FloatList& values = feature.float_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<float>().data();
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return out;
+ }
+ case DT_STRING: {
+ const BytesList& values = feature.bytes_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<string>().data();
+ std::transform(values.value().data(),
+ values.value().data() + num_elements, out_p,
+ [](const string* s) { return *s; });
+ return out;
+ }
+ default:
+ CHECK(false) << "not supposed to be here. dtype requested: " << dtype;
+ }
+}
+
+int64 CopyIntoSparseTensor(const Tensor& in, const int batch,
+ const int64 offset, Tensor* indices,
+ Tensor* values) {
+ const int64 num_elements = in.shape().num_elements();
+ const DataType& dtype = in.dtype();
+ CHECK_EQ(dtype, values->dtype());
+
+ // Update indices
+ auto ix_t = indices->matrix<int64>();
+ int64* ix_p = &ix_t(offset, 0);
+ for (int64 i = 0; i < num_elements; ++i, ix_p += 2) {
+ *ix_p = batch; // Column 0 stores the batch entry
+ *(ix_p + 1) = i; // Column 1 stores the index in the batch
+ }
+
+ // Copy values over
+ switch (dtype) {
+ case DT_INT64: {
+ std::copy_n(in.flat<int64>().data(), num_elements,
+ values->flat<int64>().data() + offset);
+ break;
+ }
+ case DT_FLOAT: {
+ std::copy_n(in.flat<float>().data(), num_elements,
+ values->flat<float>().data() + offset);
+ break;
+ }
+    case DT_STRING: {
+      std::copy_n(in.flat<string>().data(), num_elements,
+                  values->flat<string>().data() + offset);
+      break;
+    }
+ default:
+ CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype;
+ }
+
+ return num_elements;
+}
+
+void RowDenseCopy(const std::size_t& batch, const DataType& dtype,
+ const Tensor& in, Tensor* out) {
+ const std::size_t num_elements = in.shape().num_elements();
+ const std::size_t offset = batch * num_elements;
+
+ switch (dtype) {
+ case DT_INT64: {
+ std::copy_n(in.flat<int64>().data(), num_elements,
+ out->flat<int64>().data() + offset);
+ break;
+ }
+ case DT_FLOAT: {
+ std::copy_n(in.flat<float>().data(), num_elements,
+ out->flat<float>().data() + offset);
+ break;
+ }
+ case DT_STRING: {
+ std::copy_n(in.flat<string>().data(), num_elements,
+ out->flat<string>().data() + offset);
+ break;
+ }
+ default:
+ CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype;
+ }
+}
+
+} // namespace
+
+class ExampleParserOp : public OpKernel {
+ public:
+ explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_types", &sparse_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Ndense", &num_dense_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Nsparse", &num_sparse_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Tdense", &dense_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_));
+
+    OP_REQUIRES(
+        ctx, static_cast<size_t>(num_sparse_) == sparse_types_.size(),
+        errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"));
+    OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_types_.size(),
+                errors::InvalidArgument("len(dense_keys) != len(dense_types)"));
+    OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_shapes_.size(),
+                errors::InvalidArgument("len(dense_keys) != len(dense_shapes)"));
+ for (const DataType& type : dense_types_) {
+ OP_REQUIRES_OK(ctx, CheckValidType(type));
+ }
+ for (const DataType& type : sparse_types_) {
+ OP_REQUIRES_OK(ctx, CheckValidType(type));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* names;
+ const Tensor* serialized;
+ OpInputList dense_keys;
+ OpInputList sparse_keys;
+ OpInputList dense_defaults;
+
+ OP_REQUIRES_OK(ctx, ctx->input("names", &names));
+ OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized));
+ OP_REQUIRES_OK(ctx, ctx->input_list("dense_keys", &dense_keys));
+ OP_REQUIRES_OK(ctx, ctx->input_list("sparse_keys", &sparse_keys));
+ OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults));
+
+ std::vector<string> dense_keys_t(num_dense_);
+ std::vector<string> sparse_keys_t(num_sparse_);
+ CHECK_EQ(dense_keys.size(), num_dense_);
+ CHECK_EQ(sparse_keys.size(), num_sparse_);
+ for (int di = 0; di < num_dense_; ++di) {
+ dense_keys_t[di] = dense_keys[di].scalar<string>()();
+ }
+ for (int di = 0; di < num_sparse_; ++di) {
+ sparse_keys_t[di] = sparse_keys[di].scalar<string>()();
+ }
+
+ bool has_names = (names->NumElements() > 0);
+ if (has_names) {
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsVector(names->shape()),
+ errors::InvalidArgument("Expected names to be a vector, got shape: ",
+ names->shape().ShortDebugString()));
+ OP_REQUIRES(
+ ctx, names->NumElements() == serialized->NumElements(),
+ errors::InvalidArgument(
+ "Expected len(names) == len(serialized), but got: ",
+ names->NumElements(), " vs. ", serialized->NumElements()));
+ }
+ auto names_t = names->flat<string>();
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(serialized->shape()),
+ errors::InvalidArgument(
+ "Expected serialized to be a vector, got shape: ",
+ serialized->shape().ShortDebugString()));
+ OP_REQUIRES(ctx, dense_defaults.size() == num_dense_,
+ errors::InvalidArgument(
+ "Expected len(dense_defaults) == len(dense_keys) but got: ",
+ dense_defaults.size(), " vs. ", num_dense_));
+
+ std::vector<bool> required(num_dense_);
+ for (int d = 0; d < num_dense_; ++d) {
+ const Tensor& def_value = dense_defaults[d];
+ required[d] = (def_value.NumElements() == 0); // No default provided.
+
+ if (def_value.NumElements() > 0) {
+ OP_REQUIRES(
+ ctx, def_value.shape() == dense_shapes_[d],
+ errors::InvalidArgument("def_value[", d, "].shape() == ",
+ def_value.shape().ShortDebugString(),
+ " != dense_shapes_[", d, "] == ",
+ dense_shapes_[d].ShortDebugString()));
+ OP_REQUIRES(ctx, def_value.dtype() == dense_types_[d],
+ errors::InvalidArgument(
+ "dense_defaults[", d, "].dtype() == ",
+ DataTypeString(def_value.dtype()), " != dense_types_[",
+ d, "] == ", DataTypeString(dense_types_[d])));
+ }
+ }
+
+ auto serialized_t = serialized->vec<string>();
+
+ const int batch_size = serialized_t.size();
+
+ OpOutputList sparse_indices;
+ OpOutputList sparse_values;
+ OpOutputList sparse_shapes;
+ OpOutputList dense_values;
+
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes));
+ OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values));
+
+ // Preallocate dense_values, since we know their sizes
+ for (int d = 0; d < num_dense_; ++d) {
+ TensorShape out_shape;
+ out_shape.AddDim(batch_size);
+ for (const int dim : dense_shapes_[d].dim_sizes()) out_shape.AddDim(dim);
+ Tensor* out = nullptr;
+ dense_values.allocate(d, out_shape, &out);
+ }
+
+    // sparse_values_tmp will be num_sparse_ x batch_size, containing
+    // the sparse values from the parsed input Examples. After these are all
+    // stored, we can allocate properly sized outputs and copy data over.
+ // Doing it this way saves us the trouble of either performing
+ // deserialization twice, or alternatively storing all copies of
+ // the full Example protos.
+ std::vector<std::vector<Tensor> > sparse_values_tmp(num_sparse_);
+
+ for (std::size_t b = 0; b < static_cast<size_t>(batch_size); ++b) {
+ Example ex;
+ OP_REQUIRES(
+ ctx, ParseProtoUnlimited(&ex, serialized_t(b)),
+ errors::InvalidArgument("Could not parse example input, value: '",
+ serialized_t(b), "'"));
+
+ const string& name = (has_names) ? names_t(b) : "<unknown>";
+ const Features& features = ex.features();
+ const auto& feature_dict = features.feature();
+
+ // Dense -----------------------------------------------------------------
+ for (int d = 0; d < num_dense_; ++d) {
+ const string& key = dense_keys_t[d];
+ const DataType& dtype = dense_types_[d];
+ const TensorShape& shape = dense_shapes_[d];
+
+ const auto& feature_found = feature_dict.find(key);
+ OP_REQUIRES(
+ ctx, (feature_found != feature_dict.end()) || !required[d],
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ " is required but could not be found."));
+ if (feature_found != feature_dict.end()) {
+ const Feature& f = feature_found->second;
+ bool types_match;
+ OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
+ OP_REQUIRES(
+ ctx, types_match,
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ ". Data types don't match. ",
+ "Expected type: ", DataTypeString(dtype),
+ " Feature is: ", f.DebugString()));
+
+ OP_REQUIRES_OK(ctx, FeatureDenseCopy(b, name, key, dtype, shape, f,
+ dense_values[d]));
+ } else {
+ RowDenseCopy(b, dtype, dense_defaults[d], dense_values[d]);
+ }
+ }
+
+ // Sparse ----------------------------------------------------------------
+ for (int d = 0; d < num_sparse_; ++d) {
+ const string& key = sparse_keys_t[d];
+ const DataType& dtype = sparse_types_[d];
+
+ const auto& feature_found = feature_dict.find(key);
+ bool feature_has_data = // Found key & data type is set
+ (feature_found != feature_dict.end() &&
+ (feature_found->second.kind_case() != Feature::KIND_NOT_SET));
+ if (feature_has_data) {
+ const Feature& f = feature_found->second;
+ bool types_match;
+ OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
+ OP_REQUIRES(
+ ctx, types_match,
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ ". Data types don't match. ",
+ "Expected type: ", DataTypeString(dtype),
+ " Feature is: ", f.DebugString()));
+ sparse_values_tmp[d].push_back(FeatureSparseCopy(b, key, dtype, f));
+ } else {
+ sparse_values_tmp[d].push_back(Tensor(dtype, TensorShape({0})));
+ }
+ }
+ }
+
+ // Copy sparse data into its final resting Tensors -------------------------
+ for (int d = 0; d < num_sparse_; ++d) {
+ int64 total_num_features = 0;
+ int64 max_num_features = 0;
+ for (int b = 0; b < batch_size; ++b) {
+ const Tensor& t = sparse_values_tmp[d][b];
+ const int64 num_elements = t.shape().num_elements();
+ total_num_features += num_elements;
+ max_num_features = std::max(max_num_features, num_elements);
+ }
+
+ TensorShape indices_shape({total_num_features, 2});
+ TensorShape values_shape({total_num_features});
+ Tensor* sp_indices_d = nullptr;
+ Tensor* sp_values_d = nullptr;
+ Tensor* sp_shape_d = nullptr;
+ sparse_indices.allocate(d, indices_shape, &sp_indices_d);
+ sparse_values.allocate(d, values_shape, &sp_values_d);
+ sparse_shapes.allocate(d, TensorShape({2}), &sp_shape_d);
+
+ auto shape_t = sp_shape_d->vec<int64>();
+ shape_t(0) = batch_size;
+ shape_t(1) = max_num_features;
+
+ int64 offset = 0;
+
+ for (int b = 0; b < batch_size; ++b) {
+ const int64 num_elements = CopyIntoSparseTensor(
+ sparse_values_tmp[d][b], b, offset, sp_indices_d, sp_values_d);
+ offset += num_elements;
+ }
+ }
+ }
+
+ protected:
+ int64 num_sparse_;
+ int64 num_dense_;
+ std::vector<DataType> sparse_types_;
+ std::vector<DataType> dense_types_;
+ std::vector<TensorShape> dense_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU),
+ ExampleParserOp);
+
+} // namespace tensorflow
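
For the sparse outputs, the kernel above emits one [batch, within-batch index] row per feature value and reports a dense bounding shape of [batch_size, max_num_features]. A minimal sketch of that layout, with invented per-batch value counts:

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // A batch of two Examples whose sparse feature has 2 and 3 values.
      const std::vector<int> num_values = {2, 3};
      std::vector<std::pair<int, int>> indices;  // rows of the indices output
      int max_num_features = 0;
      for (int b = 0; b < static_cast<int>(num_values.size()); ++b) {
        for (int i = 0; i < num_values[b]; ++i) indices.emplace_back(b, i);
        max_num_features = std::max(max_num_features, num_values[b]);
      }
      // Prints [0,0] [0,1] [1,0] [1,1] [1,2] and shape = [2, 3].
      for (const auto& ix : indices) std::printf("[%d,%d] ", ix.first, ix.second);
      std::printf("\nshape = [%zu, %d]\n", num_values.size(), max_num_features);
      return 0;
    }
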
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
new file mode 100644
index 0000000000..dfe220fffb
--- /dev/null
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -0,0 +1,96 @@
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+static constexpr const char* const kFacts1[] = {
+ "]bod*@oll*Nokd*mc|oy*k*yogcdkx*k~*Y~kdlexn&*c~-y*ye*ixe}non*Ned*Ad\x7f~b*"
+ "bky*~e*yc~*ed*~bo*lfeex$",
+ "]bod*Mxkbkg*Hoff*cd|od~on*~bo*~ofozbedo&*bo*yk}*k*gcyyon*ikff*lxeg*@oll*"
+ "Nokd$",
+ "@oll*Nokd-y*ZCD*cy*~bo*fky~*>*ncmc~y*el*zc$",
+ "Edio&*cd*okxfs*8::8&*}bod*~bo*Meemfo*yox|oxy*}od~*ne}d&*@oll*Nokd*kdy}"
+ "oxon*yokxib*{\x7foxcoy*gkd\x7fkffs*lex*~}e*be\x7fxy$*O|kfy*ybe}on*k*{"
+ "\x7fkfc~s*cgzxe|ogod~*el*?*zecd~y$",
+ "@oll*Nokd*z\x7f~y*bcy*zkd~y*ed*edo*fom*k~*k*~cgo&*h\x7f~*cl*bo*bkn*gexo*~"
+ "bkd*~}e*fomy&*se\x7f*}e\x7f\x66n*yoo*~bk~*bcy*kzzxekib*cy*ki~\x7fkffs*"
+ "E\"fem*d#$",
+ "@oll*Nokd*iegzcfoy*kdn*x\x7f\x64y*bcy*ieno*holexo*y\x7fhgc~~cdm&*h\x7f~*"
+ "edfs*~e*iboia*lex*iegzcfox*h\x7fmy$",
+ "@oll*Nokd*ixok~on*~bo*}exfn-y*lcxy~*E\";%d#*kfmexc~bg$",
+ "@oll*Nokd*}xe~o*kd*E\"dT8#*kfmexc~bg*edio$*C~*}ky*lex*~bo*^xk|ofcdm*"
+ "Ykfoygkd*Zxehfog$",
+ "^bo*xk~o*k~*}bcib*@oll*Nokd*zxen\x7fioy*ieno*`\x7fgzon*hs*k*lki~ex*el*>:*"
+ "cd*fk~o*8:::*}bod*bo*\x7fzmxknon*bcy*aoshekxn*~e*_YH8$:$",
+ "@oll*Nokd*ikd*hok~*se\x7f*k~*ieddoi~*le\x7fx$*Cd*~bxoo*ge|oy$",
+ "@oll*Nokd*ade}y*}bs*~bo*kdy}ox*cy*>8$",
+ "@oll*Nokd*y~kx~y*bcy*zxemxkggcdm*yoyycedy*}c~b*(ik~*4*%no|%gog($",
+ "]bod*@oll*Nokd*yksy*(ezod*~bo*zen*hks*neexy(&*Bkf*ezody*~bo*zen*hks*"
+ "neexy$",
+ "@oll*Nokd*ycgzfs*}kfay*cd~e*Gexnex$",
+ "Ib\x7fia*Dexxcy*cy*@oll*Nokd-y*8:/*zxe`oi~$",
+ "@oll*Nokd-y*}k~ib*ncyzfksy*yoiedny*ycdio*@kd\x7fkxs*;y~&*;3=:$*Bo*cy*do|"
+ "ox*fk~o$",
+ "]bod*se\x7fx*ieno*bky*\x7f\x64nolcdon*hobk|cex&*se\x7f*mo~*k*"
+ "yomlk\x7f\x66~*kdn*iexx\x7fz~on*nk~k$*]bod*@oll*Nokd-y*ieno*bky*"
+ "\x7f\x64nolcdon*hobk|cex&*k*\x7f\x64\x63iexd*xcnoy*cd*ed*k*xkcdhe}*kdn*mc|"
+ "oy*o|oxshens*lxoo*cio*ixokg$",
+ "Moell*Bcd~ed*neoyd-~*doon*~e*gkao*bcnnod*\x7f\x64\x63~y$*^bos*bcno*hs*~"
+ "bogyof|oy*}bod*bo*kzzxekiboy$",
+ "Moell*Bcd~ed*neoyd-~*ncykmxoo&*bo*ied~xky~c|ofs*nc|oxmoy$",
+ "Nooz*Hofcol*Do~}exay*ki~\x7fkffs*hofco|o*noozfs*cd*Moell*Bcd~ed$",
+ "Moell*Bcd~ed*bky*ncyie|oxon*be}*~bo*hxkcd*xokffs*}exay$$$*edio*k*sokx&*"
+ "lex*~bo*fky~*8?*sokxy$",
+ "Gkxae|*xkdneg*lcofny*~bcda*Moell*Bcd~ed*cy*cd~xki~khfo$",
+ "Moell*Bcd~ed*ncnd-~*cd|od~*femci&*h\x7f~*bcy*mxok~'mxok~'mxkdnlk~box*ncn$*"
+ "\"^x\x7fo+#",
+ "Moell*Bcd~ed*bky*}xc~~od*~}e*zkzoxy*~bk~*kxo*noy~cdon*~e*xo|ef\x7f~cedcpo*"
+ "gkibcdo*fokxdcdm$*Dehens*ade}y*}bcib*~}e$"};
+static constexpr uint64 kNum1 = sizeof(kFacts1) / sizeof(kFacts1[0]);
+
+static constexpr const char* const kFacts2[] = {
+ "Yoxmos*Hxcd*kdn*Hk~gkd*bk|o*do|ox*hood*yood*k~*~bo*ykgo*zfkio*k~*~bo*ykgo*"
+ "~cgo$"};
+static constexpr uint64 kNum2 = sizeof(kFacts2) / sizeof(kFacts2[0]);
+
+static void E(string* s) {
+ for (size_t j = 0; j < s->size(); ++j) {
+ (*s)[j] ^= '\n';
+ }
+}
+
+template <const char* const FACTS[], uint64 N>
+class FactOpKernel : public OpKernel {
+ public:
+ explicit FactOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ Tensor* output_tensor = NULL;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(0, TensorShape({}), &output_tensor));
+ auto output = output_tensor->template scalar<string>();
+
+ string coded = FACTS[context->env()->NowMicros() % N];
+ E(&coded);
+ output() = coded;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_GPU).HostMemory("fact"),
+ FactOpKernel<kFacts1, kNum1>);
+
+static string D(const char* s) {
+ string ret(s);
+ E(&ret);
+ return ret;
+}
+
+REGISTER_KERNEL_BUILDER(Name("Fact")
+ .Device(DEVICE_CPU)
+ .Label(D("Yoxmos").c_str()),
+ FactOpKernel<kFacts2, kNum2>);
+REGISTER_KERNEL_BUILDER(Name("Fact")
+ .Device(DEVICE_CPU)
+ .Label(D("yoxmos").c_str()),
+ FactOpKernel<kFacts2, kNum2>);
+
+} // namespace tensorflow
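
E() above XORs every byte with '\n', so applying it twice is the identity: the string tables are stored pre-encoded and the same routine decodes them on the fly in Compute(). A small round-trip sketch on a made-up string:

    #include <cstdio>
    #include <string>

    static void E(std::string* s) {
      for (size_t j = 0; j < s->size(); ++j) (*s)[j] ^= '\n';  // self-inverse
    }

    int main() {
      std::string s = "an example fact";
      E(&s);  // obfuscate
      E(&s);  // decode
      std::printf("%s\n", s.c_str());  // prints "an example fact"
      return 0;
    }
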
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
new file mode 100644
index 0000000000..20e1f31f06
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -0,0 +1,518 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fifo_queue.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+FIFOQueue::FIFOQueue(int capacity, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name)
+ : QueueBase(component_dtypes, component_shapes, name),
+ capacity_(capacity),
+ closed_(false) {}
+
+Status FIFOQueue::Initialize() {
+ if (component_dtypes_.empty()) {
+ return errors::InvalidArgument("Empty component types for queue ", name_);
+ }
+ if (!component_shapes_.empty() &&
+ component_dtypes_.size() != component_shapes_.size()) {
+ return errors::InvalidArgument("Different number of component types (",
+ component_dtypes_.size(), ") vs. shapes (",
+ component_shapes_.size(), ").");
+ }
+
+ mutex_lock lock(mu_);
+ queues_.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ queues_.push_back(SubQueue());
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status FIFOQueue::ValidateTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (!tuple[i].shape().IsSameSize(component_shapes_[i])) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ component_shapes_[i].ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status FIFOQueue::ValidateManyTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ // Expected shape is [batch_size] + component_shapes_[i]
+ const TensorShape expected_shape = ManyOutShape(i, batch_size);
+ if (!tuple[i].shape().IsSameSize(expected_shape)) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ expected_shape.ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ } else {
+ for (size_t i = 1; i < tuple.size(); ++i) {
+ if (tuple[i].dim_size(0) != batch_size) {
+ return errors::InvalidArgument(
+ "All input tensors must have the same size in the 0th ",
+ "dimension. Component ", i, " has ", tuple[i].dim_size(0),
+ ", and should have ", batch_size);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void FIFOQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) {
+ DCHECK_GT(queues_[0].size(), 0);
+ (*tuple).reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ (*tuple).push_back(*queues_[i][0].AccessTensor(ctx));
+ queues_[i].pop_front();
+ }
+}
+
+void FIFOQueue::Cancel(Action action, CancellationToken token) {
+ DoneCallback callback = nullptr;
+ {
+ mutex_lock lock(mu_);
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ for (Attempt& attempt : *attempts) {
+ if (attempt.cancellation_token == token) {
+ attempt.is_cancelled = true;
+ if (action == kEnqueue) {
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ } else {
+ attempt.context->SetStatus(
+ errors::Cancelled("Dequeue operation was cancelled"));
+ }
+ std::swap(callback, attempt.done_callback);
+ break;
+ }
+ }
+ }
+ if (callback) {
+ callback();
+ FlushUnlocked();
+ }
+}
+
+void FIFOQueue::CloseAndCancel() {
+ std::vector<DoneCallback> callbacks;
+ {
+ mutex_lock lock(mu_);
+ closed_ = true;
+ for (Attempt& attempt : enqueue_attempts_) {
+ attempt.is_cancelled = true;
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ callbacks.emplace_back(std::move(attempt.done_callback));
+ }
+ }
+ for (const DoneCallback& callback : callbacks) {
+ callback();
+ }
+ FlushUnlocked();
+}
+
+bool FIFOQueue::TryAttemptLocked(Action action,
+ std::vector<CleanUp>* clean_up) {
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ bool progress = false;
+ bool done = false;
+ while (!done && !attempts->empty()) {
+ if (attempts->front().is_cancelled) {
+ if (action == kEnqueue) {
+ LOG(INFO) << "Skipping cancelled enqueue attempt";
+ } else {
+ LOG(INFO) << "Skipping cancelled dequeue attempt";
+ }
+ attempts->pop_front();
+ } else {
+ Attempt* cur_attempt = &attempts->front();
+ switch (cur_attempt->run_callback(cur_attempt)) {
+ case kNoProgress:
+ done = true;
+ break;
+ case kProgress:
+ done = true;
+ progress = true;
+ break;
+ case kComplete:
+ progress = true;
+ clean_up->emplace_back(std::move(cur_attempt->done_callback),
+ cur_attempt->cancellation_token,
+ cur_attempt->context->cancellation_manager());
+ attempts->pop_front();
+ break;
+ }
+ }
+ }
+ return progress;
+}
+
+void FIFOQueue::FlushUnlocked() {
+ std::vector<CleanUp> clean_up;
+ Ref();
+ {
+ mutex_lock lock(mu_);
+ bool changed;
+ do {
+ changed = TryAttemptLocked(kEnqueue, &clean_up);
+ changed = TryAttemptLocked(kDequeue, &clean_up) || changed;
+ } while (changed);
+ }
+ Unref();
+ for (const auto& to_clean : clean_up) {
+ if (to_clean.to_deregister != CancellationManager::kInvalidToken) {
+ // NOTE(mrry): We can safely ignore the return value of
+ // DeregisterCallback because the mutex mu_ ensures that the
+ // cleanup action only executes once.
+ to_clean.cm->DeregisterCallback(to_clean.to_deregister);
+ }
+ to_clean.finished();
+ }
+}
+
+void FIFOQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ 1, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(
+ errors::Aborted("FIFOQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ if (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ for (int i = 0; i < num_components(); ++i) {
+ queues_[i].push_back(PersistentTensor(tuple[i]));
+ }
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+/* static */
+Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple,
+ int index, int component,
+ OpKernelContext* ctx,
+ PersistentTensor* out_tensor) {
+ TensorShape element_shape(tuple[component].shape());
+ element_shape.RemoveDim(0);
+ Tensor* element_access = nullptr;
+ TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+ tuple[component].dtype(), element_shape, out_tensor, &element_access));
+ TF_RETURN_IF_ERROR(
+ CopySliceToElement(tuple[component], element_access, index));
+ return Status::OK();
+}
+
+void FIFOQueue::TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (batch_size == 0) {
+ callback();
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ batch_size, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(
+ errors::Aborted("FIFOQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ RunResult result = kNoProgress;
+ while (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ result = kProgress;
+ const int index =
+ tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ PersistentTensor element;
+ attempt->context->SetStatus(GetElementComponentFromBatch(
+ tuple, index, i, attempt->context, &element));
+ if (!attempt->context->status().ok()) return kComplete;
+ queues_[i].push_back(element);
+ }
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void FIFOQueue::TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ 1, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ const int32 s = queues_[0].size();
+ if (closed_ && s == 0) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "FIFOQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ", 1, ", current size ", s,
+ ")"));
+ return kComplete;
+ }
+ if (s > 0) {
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ attempt->done_callback = [callback, tuple]() { callback(tuple); };
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ if (!specified_shapes()) {
+ ctx->SetStatus(
+ errors::InvalidArgument("FIFOQueue's DequeueMany requires the "
+ "components to have specified shapes."));
+ callback(Tuple());
+ return;
+ }
+ if (num_elements == 0) {
+ Tuple tuple;
+ tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ // TODO(josh11b,misard): Switch to allocate_output(). Problem is
+ // this breaks the abstraction boundary since we don't *really*
+ // know if and how the Tensors in the tuple we pass to callback
+ // correspond to the outputs of *ctx. For example, the
+ // ReaderRead Op uses TryDequeue() to get a filename out of a
+ // queue that is used internally by the reader and is not
+ // associated with any output of the ReaderRead.
+ // mrry@ adds:
+ // Maybe we need to pass a std::function<Tensor*(...)> (or
+ // better signature) that calls the appropriate allocator
+ // function in addition to ctx? (Or support a shim Allocator
+ // that has an internal OpKernelContext*, and dispatches to the
+ // appropriate method?)
+ // misard@ adds:
+ // I don't see that a std::function would help. The problem is
+ // that at this point (allocation time) the system doesn't know
+ // what is going to happen to the element read out of the
+ // queue. As long as we keep the generality that TensorFlow Ops
+ // do their own dynamic allocation in arbitrary C++ code, we
+ // need to preserve robustness to allocating output Tensors with
+ // the 'wrong' attributes, and fixing up with a copy. The only
+ // improvement I can see here in the future would be to support
+ // an optimized case where the queue 'knows' what attributes to
+ // use, and plumbs them through here.
+ Tensor element;
+ ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element);
+ tuple.emplace_back(element);
+ }
+ callback(tuple);
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ num_elements, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s < attempt->elements_requested) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "FIFOQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ",
+ attempt->elements_requested, ", current size ", s, ")"));
+
+ // TODO(mrry): Add support for producing a partial batch as
+ // output when the queue is closed.
+ if (!attempt->tuple.empty()) {
+ // Restore already-dequeued elements to the front of the queue.
+ for (int64 i = attempt->tuple[0].dim_size(0) -
+ attempt->elements_requested - 1;
+ i >= 0; --i) {
+ for (int j = 0; j < num_components(); ++j) {
+ PersistentTensor element;
+ Status s = GetElementComponentFromBatch(
+ attempt->tuple, i, j, attempt->context, &element);
+ if (!s.ok()) {
+ attempt->context->SetStatus(
+ errors::DataLoss("Failed to restore element from "
+ "partially-dequeued batch "
+ "to FIFOQueue"));
+ }
+ queues_[j].push_front(element);
+ }
+ }
+ }
+ return kComplete;
+ }
+
+ RunResult result = kNoProgress;
+ for (; s > 0; --s) {
+ if (attempt->tuple.empty()) {
+ // Only allocate tuple when we have something to dequeue
+                  // so we don't use excessive memory when there are many
+ // blocked dequeue attempts waiting.
+ attempt->tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ const TensorShape shape =
+ ManyOutShape(i, attempt->elements_requested);
+ Tensor element;
+ attempt->context->allocate_temp(component_dtypes_[i], shape,
+ &element);
+ attempt->tuple.emplace_back(element);
+ }
+ }
+ result = kProgress;
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ const int index =
+ attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ attempt->context->SetStatus(
+ CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+ if (!attempt->context->status().ok()) return kComplete;
+ }
+ tuple.clear();
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ tuple = attempt->tuple;
+ attempt->done_callback = [callback, tuple]() {
+ callback(tuple);
+ };
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void FIFOQueue::Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) {
+ if (cancel_pending_enqueues) {
+ CloseAndCancel();
+ callback();
+ } else {
+ {
+ mutex_lock lock(mu_);
+ enqueue_attempts_.emplace_back(
+ 0, callback, ctx, CancellationManager::kInvalidToken,
+ [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "FIFOQueue '", name_, "' is already closed."));
+ } else {
+ closed_ = true;
+ }
+ return kComplete;
+ });
+ }
+ FlushUnlocked();
+ }
+}
+
+Status FIFOQueue::MatchesNodeDef(const NodeDef& node_def) {
+ TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "FIFOQueue"));
+ TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_));
+ TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def));
+ TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def));
+ return Status::OK();
+}
+
+} // namespace tensorflow
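
TryAttemptLocked drains whichever attempt queue it is given until an attempt reports no progress, popping attempts whose callbacks return kComplete; FlushUnlocked then re-runs it until nothing changes. A simplified, self-contained sketch of that loop (cancellation handling omitted, all names here are invented):

    #include <cstdio>
    #include <deque>
    #include <functional>

    enum RunResult { kNoProgress, kProgress, kComplete };

    struct Attempt {
      std::function<RunResult()> run_callback;
    };

    // Mirrors the drain loop: stop on the first blocked attempt, pop completed ones.
    bool TryAttempts(std::deque<Attempt>* attempts) {
      bool progress = false;
      bool done = false;
      while (!done && !attempts->empty()) {
        switch (attempts->front().run_callback()) {
          case kNoProgress:
            done = true;
            break;
          case kProgress:
            done = true;
            progress = true;
            break;
          case kComplete:
            progress = true;
            attempts->pop_front();
            break;
        }
      }
      return progress;
    }

    int main() {
      int available = 2;  // stand-in for queued elements
      std::deque<Attempt> dequeue_attempts;
      for (int i = 0; i < 3; ++i) {
        dequeue_attempts.push_back({[&available]() {
          if (available > 0) {
            --available;
            return kComplete;
          }
          return kNoProgress;
        }});
      }
      while (TryAttempts(&dequeue_attempts)) {
      }
      std::printf("attempts left: %zu\n", dequeue_attempts.size());  // prints 1
      return 0;
    }
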
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
new file mode 100644
index 0000000000..e9fe5f34a4
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -0,0 +1,127 @@
+#ifndef TENSORFLOW_KERNELS_FIFO_QUEUE_H_
+#define TENSORFLOW_KERNELS_FIFO_QUEUE_H_
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class FIFOQueue : public QueueBase {
+ public:
+ FIFOQueue(int32 capacity, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+ Status Initialize(); // Must be called before any other method.
+
+ // Implementations of QueueInterface methods --------------------------------
+
+ Status ValidateTuple(const Tuple& tuple) override;
+ Status ValidateManyTuple(const Tuple& tuple) override;
+ void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override;
+ void TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) override;
+ void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) override;
+ Status MatchesNodeDef(const NodeDef& node_def) override;
+
+ int32 size() override {
+ mutex_lock lock(mu_);
+ return queues_[0].size();
+ }
+
+ int32 capacity() const { return capacity_; }
+
+ private:
+ enum Action { kEnqueue, kDequeue };
+
+ ~FIFOQueue() override {}
+
+ TensorShape ManyOutShape(int i, int64 batch_size) {
+ TensorShape shape({batch_size});
+ shape.AppendShape(component_shapes_[i]);
+ return shape;
+ }
+
+ // Helper for dequeuing a single element from queues_.
+ void DequeueLocked(OpKernelContext* ctx, Tuple* tuple)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ void Cancel(Action action, CancellationToken token);
+
+ // Helper for cancelling all pending Enqueue(Many) operations when
+ // Close is called with cancel_pending_enqueues.
+ void CloseAndCancel();
+
+  // Tries to enqueue/dequeue (or close) based on whatever is at the
+  // front of enqueue_attempts_/dequeue_attempts_. Appends to
+  // *clean_up the callback for any finished attempt (so it may be
+  // called once mu_ is released). Returns true if any progress was
+  // made.
+ struct CleanUp {
+ CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm)
+ : finished(f), to_deregister(ct), cm(cm) {}
+ DoneCallback finished;
+ CancellationToken to_deregister;
+ CancellationManager* cm;
+ };
+ bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ // Tries to make progress on the enqueues or dequeues at the front
+ // of the *_attempts_ queues.
+ void FlushUnlocked();
+
+ const int32 capacity_;
+
+ mutex mu_;
+ typedef std::deque<PersistentTensor> SubQueue;
+ std::vector<SubQueue> queues_ GUARDED_BY(mu_);
+ bool closed_ GUARDED_BY(mu_);
+
+ enum RunResult { kNoProgress, kProgress, kComplete };
+ struct Attempt;
+ typedef std::function<RunResult(Attempt*)> RunCallback;
+ struct Attempt {
+ int32 elements_requested;
+ DoneCallback done_callback; // must be run outside mu_
+ OpKernelContext* context;
+ CancellationToken cancellation_token;
+ RunCallback run_callback; // must be run while holding mu_
+ bool is_cancelled;
+ Tuple tuple;
+
+ Attempt(int32 elements_requested, DoneCallback done_callback,
+ OpKernelContext* context, CancellationToken cancellation_token,
+ RunCallback run_callback)
+ : elements_requested(elements_requested),
+ done_callback(done_callback),
+ context(context),
+ cancellation_token(cancellation_token),
+ run_callback(run_callback),
+ is_cancelled(false) {}
+ };
+ std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_);
+ std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_);
+
+ static Status GetElementComponentFromBatch(const Tuple& tuple, int index,
+ int component,
+ OpKernelContext* ctx,
+ PersistentTensor* out_element);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueue);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_FIFO_QUEUE_H_
diff --git a/tensorflow/core/kernels/fifo_queue_op.cc b/tensorflow/core/kernels/fifo_queue_op.cc
new file mode 100644
index 0000000000..f1088181fe
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue_op.cc
@@ -0,0 +1,93 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fifo_queue.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Defines a FIFOQueueOp, which produces a Queue (specifically, one
+// backed by FIFOQueue) that persists across different graph
+// executions and sessions. Running this op produces a single-element
+// tensor of handles to Queues in the corresponding device.
+class FIFOQueueOp : public OpKernel {
+ public:
+ explicit FIFOQueueOp(OpKernelConstruction* context)
+ : OpKernel(context), queue_handle_set_(false) {
+ OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_));
+ OP_REQUIRES_OK(context,
+ context->allocate_persistent(DT_STRING, TensorShape({2}),
+ &queue_handle_, nullptr));
+ if (capacity_ < 0) {
+ capacity_ = FIFOQueue::kUnbounded;
+ }
+ OP_REQUIRES_OK(context,
+ context->GetAttr("component_types", &component_types_));
+ OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_));
+ }
+
+ ~FIFOQueueOp() override {
+ // If the queue object was not shared, delete it.
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!queue_handle_set_) {
+ OP_REQUIRES_OK(ctx, SetQueueHandle(ctx));
+ }
+ ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx));
+ }
+
+ private:
+ Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def()));
+ QueueInterface* queue;
+ auto creator = [this](QueueInterface** ret) {
+ FIFOQueue* queue = new FIFOQueue(capacity_, component_types_,
+ component_shapes_, cinfo_.name());
+ *ret = queue;
+ return queue->Initialize();
+ };
+ TF_RETURN_IF_ERROR(
+ cinfo_.resource_manager()->LookupOrCreate<QueueInterface>(
+ cinfo_.container(), cinfo_.name(), &queue, creator));
+ core::ScopedUnref unref_me(queue);
+ // Verify that the shared queue is compatible with the requested arguments.
+ TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def()));
+ auto h = queue_handle_.AccessTensor(ctx)->flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ queue_handle_set_ = true;
+ return Status::OK();
+ }
+
+ int32 capacity_;
+ DataTypeVector component_types_;
+ std::vector<TensorShape> component_shapes_;
+ ContainerInfo cinfo_;
+
+ mutex mu_;
+ PersistentTensor queue_handle_ GUARDED_BY(mu_);
+ bool queue_handle_set_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("FIFOQueue").Device(DEVICE_CPU), FIFOQueueOp);
+
+} // namespace tensorflow
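
The op caches a handle and relies on the resource manager's look-up-or-create behaviour, so every op instance naming the same container/name shares one queue. A simplified stand-in for that pattern (not TensorFlow's ResourceMgr API; all names here are illustrative):

    #include <cstdio>
    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Queue { int capacity; };

    class Registry {
     public:
      // The first call for a given name runs the creator; later calls return
      // the already-created object, which is how the queue is shared.
      Queue* LookupOrCreate(const std::string& name,
                            const std::function<Queue*()>& creator) {
        auto it = objects_.find(name);
        if (it == objects_.end()) {
          it = objects_.emplace(name, std::unique_ptr<Queue>(creator())).first;
        }
        return it->second.get();
      }

     private:
      std::map<std::string, std::unique_ptr<Queue>> objects_;
    };

    int main() {
      Registry registry;
      auto creator = []() { return new Queue{10}; };
      Queue* a = registry.LookupOrCreate("fifo_queue", creator);
      Queue* b = registry.LookupOrCreate("fifo_queue", creator);  // shared
      std::printf("same object: %s\n", a == b ? "yes" : "no");    // prints yes
      return 0;
    }
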
diff --git a/tensorflow/core/kernels/fill_functor.h b/tensorflow/core/kernels/fill_functor.h
new file mode 100644
index 0000000000..831f0c899e
--- /dev/null
+++ b/tensorflow/core/kernels/fill_functor.h
@@ -0,0 +1,26 @@
+#ifndef TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct FillFunctor {
+  // Computes on device "d": out = out.constant(in(0)).
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in);
+};
+
+template <typename Device, typename T>
+struct SetZeroFunctor {
+  // Computes on device "d": out = out.setZero().
+ void operator()(const Device& d, typename TTypes<T>::Flat out);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
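
The header only declares the functors; a device-templated definition typically drives a single Eigen expression. A standalone sketch of the intended contract, using Eigen::DefaultDevice in place of the Device template parameter (illustrative only, not the kernel's actual definition):

    #include <iostream>
    #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

    int main() {
      Eigen::Tensor<float, 1> out(8);   // stand-in for TTypes<T>::Flat
      Eigen::Tensor<float, 0> in;       // stand-in for TTypes<T>::ConstScalar
      in() = 3.5f;
      Eigen::DefaultDevice d;
      out.device(d) = out.constant(in());  // broadcast the scalar into out
      std::cout << out(0) << " ... " << out(7) << std::endl;  // 3.5 ... 3.5
      return 0;
    }
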
diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
new file mode 100644
index 0000000000..77516ab151
--- /dev/null
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@@ -0,0 +1,109 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
+class FixedLengthRecordReader : public ReaderBase {
+ public:
+ FixedLengthRecordReader(const string& node_name, int64 header_bytes,
+ int64 record_bytes, int64 footer_bytes, Env* env)
+ : ReaderBase(
+ strings::StrCat("FixedLengthRecordReader '", node_name, "'")),
+ header_bytes_(header_bytes),
+ record_bytes_(record_bytes),
+ footer_bytes_(footer_bytes),
+ env_(env),
+ file_pos_limit_(-1),
+ record_number_(0) {}
+
+ // On success:
+ // * input_buffer_ != nullptr,
+  //   * input_buffer_->Tell() == header_bytes_
+  //   * file_pos_limit_ == file size - footer_bytes_
+ Status OnWorkStartedLocked() override {
+ record_number_ = 0;
+ uint64 file_size = 0;
+ TF_RETURN_IF_ERROR(env_->GetFileSize(current_work(), &file_size));
+ file_pos_limit_ = file_size - footer_bytes_;
+
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ input_buffer_.reset(new io::InputBuffer(file, kBufferSize));
+ TF_RETURN_IF_ERROR(input_buffer_->SkipNBytes(header_bytes_));
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ input_buffer_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ if (input_buffer_->Tell() >= file_pos_limit_) {
+ *at_end = true;
+ return Status::OK();
+ }
+ TF_RETURN_IF_ERROR(input_buffer_->ReadNBytes(record_bytes_, value));
+ *key = strings::StrCat(current_work(), ":", record_number_);
+ *produced = true;
+ ++record_number_;
+ return Status::OK();
+ }
+
+ Status ResetLocked() override {
+ file_pos_limit_ = -1;
+ record_number_ = 0;
+ input_buffer_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state.
+
+ private:
+ enum { kBufferSize = 256 << 10 /* 256 kB */ };
+ const int64 header_bytes_;
+ const int64 record_bytes_;
+ const int64 footer_bytes_;
+ Env* const env_;
+ int64 file_pos_limit_;
+ int64 record_number_;
+ std::unique_ptr<io::InputBuffer> input_buffer_;
+};
+
+class FixedLengthRecordReaderOp : public ReaderOpKernel {
+ public:
+ explicit FixedLengthRecordReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ int64 header_bytes = -1, record_bytes = -1, footer_bytes = -1;
+ OP_REQUIRES_OK(context, context->GetAttr("header_bytes", &header_bytes));
+ OP_REQUIRES_OK(context, context->GetAttr("record_bytes", &record_bytes));
+ OP_REQUIRES_OK(context, context->GetAttr("footer_bytes", &footer_bytes));
+ OP_REQUIRES(context, header_bytes >= 0,
+ errors::InvalidArgument("header_bytes must be >= 0 not ",
+ header_bytes));
+ OP_REQUIRES(context, record_bytes >= 0,
+ errors::InvalidArgument("record_bytes must be >= 0 not ",
+ record_bytes));
+ OP_REQUIRES(context, footer_bytes >= 0,
+ errors::InvalidArgument("footer_bytes must be >= 0 not ",
+ footer_bytes));
+ Env* env = context->env();
+ SetReaderFactory([this, header_bytes, record_bytes, footer_bytes, env]() {
+ return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
+ footer_bytes, env);
+ });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordReader").Device(DEVICE_CPU),
+ FixedLengthRecordReaderOp);
+
+} // namespace tensorflow
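
For a well-formed file (file_size == header_bytes + n * record_bytes + footer_bytes), the reader above produces exactly n records, keyed "<filename>:<record_number>". A small sketch of that arithmetic, with invented sizes and filename:

    #include <cstdio>
    #include <string>

    int main() {
      const long long file_size = 1024, header_bytes = 16, footer_bytes = 8,
                      record_bytes = 100;
      const long long num_records =
          (file_size - header_bytes - footer_bytes) / record_bytes;
      std::printf("num_records = %lld\n", num_records);  // prints 10
      for (long long i = 0; i < 2; ++i) {
        const std::string key = "data.bin:" + std::to_string(i);
        std::printf("%s\n", key.c_str());  // data.bin:0, data.bin:1
      }
      return 0;
    }
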
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
new file mode 100644
index 0000000000..8bd48f26d6
--- /dev/null
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -0,0 +1,136 @@
+// See docs in ../ops/array_ops.cc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+template <typename T, typename Index, int static_slice_elems>
+void HandleCopies(const Tensor& Tparams,
+ typename TTypes<Index>::ConstVec& Tindices, int slice_elems,
+ typename TTypes<T>::Matrix Tout) {
+ const int N = Tindices.dimension(0);
+ const auto& Tparams_flat = Tparams.flat_outer_dims<T>();
+ T* Tout_base = &Tout(0, 0);
+ const T* Tparams_base = &Tparams_flat(0, 0);
+ const size_t slice_bytes = slice_elems * sizeof(T);
+ if (static_slice_elems >= 0) {
+ // Give compiler static knowledge of the number of elements/bytes
+ CHECK_EQ(static_slice_elems, slice_elems);
+ slice_elems = static_slice_elems;
+ }
+ for (int i = 0; i < N; i++) {
+ int j = i + 1;
+ if (j < N) {
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tparams_flat(Tindices(j), 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tout(j, 0));
+ }
+ memcpy(Tout_base + i * slice_elems,
+ Tparams_base + Tindices(i) * slice_elems, slice_bytes);
+ }
+}
+
+} // anonymous namespace
+
+template <typename T, typename Index>
+class GatherOp : public OpKernel {
+ public:
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8,
+ // etc. here for the type of the second input argument. Should
+ // we have the framework do some sort of integer promotion
+ // automatically, or should that be something that users have to
+ // do explicitly with a conversion operator in the graph?
+ explicit GatherOp(OpKernelConstruction* c) : OpKernel(c) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ const DataType index_t = DataTypeToEnum<Index>::v();
+ OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t}, {dt}));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& Tparams = c->input(0);
+ const Tensor& Tindices = c->input(1);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()),
+ errors::InvalidArgument("params must be at least 1 dimensional"));
+ const int64 N = Tindices.NumElements();
+ const int64 first_dim_size = Tparams.dim_size(0);
+
+ // Validate all the indices are in range
+ auto Tindices_vec = Tindices.flat<Index>();
+ for (int64 i = 0; i < N; i++) {
+ const Index index = Tindices_vec(i);
+ OP_REQUIRES(c, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in Tindices is out of range")));
+ }
+
+ // The result shape is indices.shape + params.shape[1:].
+ TensorShape result_shape = Tindices.shape();
+ for (int i = 1; i < Tparams.dims(); i++) {
+ result_shape.AddDim(Tparams.dim_size(i));
+ }
+
+ Tensor* Tout = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &Tout));
+ const auto& Tparams_flat = Tparams.flat_outer_dims<T>();
+ if (N > 0) {
+ auto Tindices_flat = Tindices.flat<Index>();
+ auto Tout_flat = Tout->shaped<T, 2>({N, Tout->NumElements() / N});
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ const int64 slice_size = Tout->NumElements() / N;
+#define SPECIALIZE(elems) \
+ do { \
+ if (slice_size == elems) { \
+ HandleCopies<T, Index, elems>(Tparams, Tindices_flat, slice_size, \
+ Tout_flat); \
+ return; \
+ } \
+ } while (0)
+
+ SPECIALIZE(10);
+ SPECIALIZE(20);
+
+#undef SPECIALIZE
+
+ HandleCopies<T, Index, -1>(Tparams, Tindices_flat, slice_size,
+ Tout_flat);
+ } else {
+ for (int i = 0; i < N; i++) {
+ int j = i + 1;
+ if (j < N) {
+ port::prefetch<port::PREFETCH_HINT_T0>(
+ &Tparams_flat(Tindices_vec(j), 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tout_flat(j, 0));
+ }
+ // Copy last Ndim-1 dimensions of Tparams[Tindices[i]] to Tout[i]
+ Tout_flat.template chip<0>(i) =
+ Tparams_flat.template chip<0>(Tindices_vec(i));
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_GATHER(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("Gather") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("Tparams") \
+ .TypeConstraint<index_type>("Tindices"), \
+ GatherOp<type, index_type>)
+
+#define REGISTER_GATHER_INT32(type) REGISTER_GATHER(type, int32)
+#define REGISTER_GATHER_INT64(type) REGISTER_GATHER(type, int64)
+
+TF_CALL_ALL_TYPES(REGISTER_GATHER_INT32);
+TF_CALL_ALL_TYPES(REGISTER_GATHER_INT64);
+
+#undef REGISTER_GATHER_INT32
+#undef REGISTER_GATHER_INT64
+#undef REGISTER_GATHER
+
+} // namespace tensorflow
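The memcpy path above relies on a simple layout: params is viewed as a [first_dim, slice_elems] matrix and one contiguous slice is copied per index, so the result shape is indices.shape + params.shape[1:]. A minimal standalone sketch of that copy loop, in plain C++ and independent of the Tensor API (all names here are illustrative):

#include <cstring>
#include <iostream>
#include <vector>

// Gathers rows of a flattened [num_rows, slice_elems] table into out,
// one contiguous memcpy per index -- the same layout the kernel relies on.
void GatherRows(const std::vector<float>& params, int slice_elems,
                const std::vector<int>& indices, std::vector<float>* out) {
  out->resize(indices.size() * slice_elems);
  for (size_t i = 0; i < indices.size(); ++i) {
    std::memcpy(out->data() + i * slice_elems,
                params.data() + indices[i] * slice_elems,
                slice_elems * sizeof(float));
  }
}

int main() {
  // params has shape [5, 3]; gathering indices {0, 4, 0, 2} yields shape [4, 3].
  std::vector<float> params = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
  std::vector<int> indices = {0, 4, 0, 2};
  std::vector<float> out;
  GatherRows(params, /*slice_elems=*/3, indices, &out);
  for (float v : out) std::cout << v << " ";  // 0 1 2 12 13 14 0 1 2 6 7 8
  std::cout << "\n";
}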
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
new file mode 100644
index 0000000000..d7410169e1
--- /dev/null
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -0,0 +1,213 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+class GatherOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType index_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Gather")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(index_type))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(GatherOpTest, ScalarIndices) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({}));
+ test::FillValues<float>(&expected, {3});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Simple_TwoD32) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({4}), {0, 4, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Simple_TwoD64) {
+ MakeOp(DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int64>(TensorShape({4}), {0, 4, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, HighRank) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({4}), {0, 1, 2, 3});
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 0, 2, 3, 0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {1, 2, 0, 2, 3, 0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Error_IndexOutOfRange) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({4}), {0, 4, 99, 2});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Index 99 at offset 2 in Tindices is out of range"))
+ << s;
+}
+
+class GatherOpForBenchmark : public GatherOpTest {
+ public:
+  // TestBody() is unused; the benchmark drives the op through PublicMakeOp().
+  void TestBody() override {}

+ void PublicMakeOp(DataType index_type) { MakeOp(index_type); }
+};
+
+static const int kSorted = 0x8000; // Mask for arg to specify sorting vs. not
+
+template <typename Index>
+void BM_Gather(int iters, int arg) {
+ testing::StopTiming();
+
+ bool sorted = ((arg & kSorted) != 0);
+ int dim = arg & ~kSorted;
+
+ GatherOpForBenchmark t;
+ t.PublicMakeOp(DataTypeToEnum<Index>::v());
+ // Use a 512 MB table, regardless of dim
+ const int kRows = ((1 << 29) / sizeof(float)) / dim;
+ std::vector<float> data(kRows * dim, 1.0f);
+ t.AddInputFromArray<float>(TensorShape({kRows, dim}), data);
+ const int kLookups = 2000;
+ const int kBatches = 1000000 / kLookups;
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ std::vector<std::vector<Index>> all_ids(kBatches);
+ for (int i = 0; i < kBatches; ++i) {
+ std::vector<Index>* ids = &all_ids[i];
+ ids->resize(kLookups);
+ for (int j = 0; j < kLookups; ++j) {
+ (*ids)[j] = rnd.Uniform(kRows);
+ }
+ if (sorted) {
+ sort(ids->begin(), ids->end());
+ }
+ }
+
+ t.AddInput<Index>(TensorShape({kLookups}), [](int i) { return 0; });
+ if (sorted) {
+ testing::SetLabel("sorted by id");
+ }
+ testing::BytesProcessed(static_cast<int64>(iters) * kLookups * dim *
+ sizeof(float));
+ testing::StartTiming();
+ while (--iters > 0) {
+ const std::vector<Index>& b = all_ids[iters % kBatches];
+ TensorValue input = t.mutable_input(1);
+ gtl::MutableArraySlice<Index> slice(&input->vec<Index>()(0),
+ input->NumElements());
+ for (int i = 0; i < kLookups; i++) {
+ slice[i] = b[i];
+ }
+ Status s = t.RunOpKernel();
+ }
+}
+
+static void BM_Gather32(int iters, int arg) { BM_Gather<int32>(iters, arg); }
+
+static void BM_Gather64(int iters, int arg) { BM_Gather<int64>(iters, arg); }
+
+BENCHMARK(BM_Gather32)
+ ->Arg(10)
+ ->Arg(10 | kSorted)
+ ->Arg(20)
+ ->Arg(40)
+ ->Arg(63)
+ ->Arg(63 | kSorted)
+ ->Arg(64)
+ ->Arg(64 | kSorted)
+ ->Arg(65)
+ ->Arg(65 | kSorted)
+ ->Arg(100)
+ ->Arg(100 | kSorted)
+ ->Arg(127)
+ ->Arg(127 | kSorted)
+ ->Arg(128)
+ ->Arg(128 | kSorted)
+ ->Arg(129)
+ ->Arg(129 | kSorted)
+ ->Arg(1000)
+ ->Arg(1000 | kSorted);
+
+BENCHMARK(BM_Gather64)
+ ->Arg(10)
+ ->Arg(10 | kSorted)
+ ->Arg(20)
+ ->Arg(40)
+ ->Arg(63)
+ ->Arg(63 | kSorted)
+ ->Arg(64)
+ ->Arg(64 | kSorted)
+ ->Arg(65)
+ ->Arg(65 | kSorted)
+ ->Arg(100)
+ ->Arg(100 | kSorted)
+ ->Arg(127)
+ ->Arg(127 | kSorted)
+ ->Arg(128)
+ ->Arg(128 | kSorted)
+ ->Arg(129)
+ ->Arg(129 | kSorted)
+ ->Arg(1000)
+ ->Arg(1000 | kSorted);
+
+} // namespace
+} // namespace tensorflow
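The benchmark packs two things into a single argument: the low bits carry the slice dimension and bit 0x8000 (kSorted) flags whether the lookup ids are pre-sorted. A small sketch of that packing, using only the constant defined in the test above (everything else is illustrative):

#include <cstdio>

// Mirrors how the benchmark encodes its argument: low bits are the slice
// dimension, bit 0x8000 marks "ids sorted". Constant name matches the test.
constexpr int kSorted = 0x8000;

int main() {
  int arg = 128 | kSorted;             // "dim 128, sorted ids"
  bool sorted = (arg & kSorted) != 0;
  int dim = arg & ~kSorted;
  std::printf("dim=%d sorted=%d\n", dim, sorted);  // dim=128 sorted=1
}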
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
new file mode 100644
index 0000000000..b29efbddfb
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -0,0 +1,45 @@
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/identity_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp);
+// StopGradient does the same thing as Identity, but has a different
+// gradient registered.
+REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp);
+
+REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Identity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RefIdentity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("StopGradient").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp)
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+REGISTER_GPU_KERNEL(bool);
+REGISTER_GPU_KERNEL(bfloat16);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Identity")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ IdentityOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/identity_op.h b/tensorflow/core/kernels/identity_op.h
new file mode 100644
index 0000000000..7adc1eace0
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_IDENTITY_OP_H_
+#define TENSORFLOW_KERNELS_IDENTITY_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class IdentityOp : public OpKernel {
+ public:
+ explicit IdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ context->set_output(0, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_IDENTITY_OP_H_
diff --git a/tensorflow/core/kernels/identity_op_test.cc b/tensorflow/core/kernels/identity_op_test.cc
new file mode 100644
index 0000000000..6483367a79
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op_test.cc
@@ -0,0 +1,56 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class IdentityOpTest : public OpsTestBase {
+ protected:
+ Status Init(DataType input_type) {
+ RequireDefaultOps();
+ TF_CHECK_OK(NodeDefBuilder("op", "Identity")
+ .Input(FakeInput(input_type))
+ .Finalize(node_def()));
+ return InitOp();
+ }
+};
+
+TEST_F(IdentityOpTest, Int32Success_6) {
+ ASSERT_OK(Init(DT_INT32));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, Int32Success_2_3) {
+ ASSERT_OK(Init(DT_INT32));
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, StringSuccess) {
+ ASSERT_OK(Init(DT_STRING));
+ AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+ test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, RefInputError) { ASSERT_OK(Init(DT_INT32_REF)); }
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/identity_reader_op.cc b/tensorflow/core/kernels/identity_reader_op.cc
new file mode 100644
index 0000000000..a63fea5dbb
--- /dev/null
+++ b/tensorflow/core/kernels/identity_reader_op.cc
@@ -0,0 +1,57 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class IdentityReader : public ReaderBase {
+ public:
+ explicit IdentityReader(const string& node_name)
+ : ReaderBase(strings::StrCat("IdentityReader '", node_name, "'")) {}
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = current_work();
+ *value = current_work();
+ *produced = true;
+ *at_end = true;
+ return Status::OK();
+ }
+
+ // Stores state in a ReaderBaseState proto, since IdentityReader has
+ // no additional state beyond ReaderBase.
+ Status SerializeStateLocked(string* state) override {
+ ReaderBaseState base_state;
+ SaveBaseState(&base_state);
+ base_state.SerializeToString(state);
+ return Status::OK();
+ }
+
+ Status RestoreStateLocked(const string& state) override {
+ ReaderBaseState base_state;
+ if (!ParseProtoUnlimited(&base_state, state)) {
+ return errors::InvalidArgument("Could not parse state for ", name(), ": ",
+ str_util::CEscape(state));
+ }
+ TF_RETURN_IF_ERROR(RestoreBaseState(base_state));
+ return Status::OK();
+ }
+};
+
+class IdentityReaderOp : public ReaderOpKernel {
+ public:
+ explicit IdentityReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ SetReaderFactory([this]() { return new IdentityReader(name()); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IdentityReader").Device(DEVICE_CPU),
+ IdentityReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
new file mode 100644
index 0000000000..d08f6f53da
--- /dev/null
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -0,0 +1,58 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+template <typename T>
+class InTopK : public OpKernel {
+ public:
+ explicit InTopK(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& predictions_in = context->input(0);
+ const auto& targets_in = context->input(1);
+ OP_REQUIRES(context, predictions_in.dims() == 2,
+ errors::InvalidArgument("predictions must be 2-dimensional"));
+ OP_REQUIRES(context, targets_in.dims() == 1,
+ errors::InvalidArgument("targets must be 1-dimensional"));
+ OP_REQUIRES(context, predictions_in.dim_size(0) == targets_in.dim_size(0),
+ errors::InvalidArgument("First dimension of predictions ",
+ predictions_in.dim_size(0),
+ " must match length of targets ",
+ targets_in.dim_size(0)));
+ const auto& predictions = predictions_in.matrix<T>();
+ const auto& targets = targets_in.vec<int>();
+
+ Tensor* t_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({targets_in.dim_size(0)}), &t_out));
+ auto out = t_out->vec<bool>();
+
+ const auto size = targets.size();
+ const auto num_classes = predictions.dimension(1);
+ for (int b = 0; b < size; b++) {
+ T target_prediction = predictions(b, targets(b));
+ int more_probable_classes = 0;
+ for (int i = 0; i < num_classes; ++i) {
+ if (predictions(b, i) > target_prediction) ++more_probable_classes;
+ }
+ out(b) = more_probable_classes < k_;
+ }
+ }
+
+ private:
+ int k_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InTopK").Device(DEVICE_CPU), InTopK<float>);
+
+} // namespace tensorflow
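InTopK marks row b as a hit when strictly fewer than k classes score higher than the target class, so ties count in the target's favor. A standalone sketch of that counting rule in plain C++ (names are illustrative):

#include <iostream>
#include <vector>

// Returns true when fewer than k classes have a strictly larger score than
// the target class -- the same per-row counting rule the kernel applies.
bool InTopK(const std::vector<float>& predictions, int target, int k) {
  int more_probable = 0;
  for (float p : predictions) {
    if (p > predictions[target]) ++more_probable;
  }
  return more_probable < k;
}

int main() {
  std::vector<float> row = {0.1f, 0.3f, 0.2f, 0.4f};
  std::cout << InTopK(row, /*target=*/2, /*k=*/1) << "\n";  // 0: two classes beat it
  std::cout << InTopK(row, /*target=*/2, /*k=*/3) << "\n";  // 1
}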
diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc
new file mode 100644
index 0000000000..7f8b070556
--- /dev/null
+++ b/tensorflow/core/kernels/initializable_lookup_table.cc
@@ -0,0 +1,41 @@
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace lookup {
+
+Status InitializableLookupTable::Find(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) {
+ if (!is_initialized()) {
+ return errors::FailedPrecondition("Table not initialized.");
+ }
+ TF_RETURN_IF_ERROR(CheckFindArguments(keys, *values, default_value));
+ return DoFind(keys, values, default_value);
+}
+
+Status InitializableLookupTable::Initialize(InitTableIterator& iter) {
+ if (!iter.Valid()) {
+ return iter.status();
+ }
+ TF_RETURN_IF_ERROR(CheckKeyAndValueTensors(iter.keys(), iter.values()));
+
+ mutex_lock l(mu_);
+ if (is_initialized()) {
+ return errors::FailedPrecondition("Table already initialized.");
+ }
+
+ TF_RETURN_IF_ERROR(DoPrepare(iter.total_size()));
+ while (iter.Valid()) {
+ TF_RETURN_IF_ERROR(DoInsert(iter.keys(), iter.values()));
+ iter.Next();
+ }
+ if (!errors::IsOutOfRange(iter.status())) {
+ return iter.status();
+ }
+ is_initialized_ = true;
+ return Status::OK();
+}
+
+} // namespace lookup
+} // namespace tensorflow
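Initialize drains the iterator batch by batch and treats an OutOfRange status as the normal end-of-data signal; any other non-OK status aborts the load. A framework-free sketch of that convention (the toy types below are illustrative, not part of this change):

#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for the iterator contract: Valid()/Next()/status(), where an
// "out of range" status signals normal exhaustion rather than a failure.
struct Batch { std::vector<int> keys, values; };

struct ToyIterator {
  std::vector<Batch> batches;
  size_t pos = 0;
  bool Valid() const { return pos < batches.size(); }
  void Next() { ++pos; }
  const Batch& Current() const { return batches[pos]; }
  std::string status() const { return Valid() ? "ok" : "out_of_range"; }
};

int main() {
  ToyIterator it;
  it.batches = {{{1, 2}, {10, 20}}, {{3}, {30}}};
  int inserted = 0;
  while (it.Valid()) {  // drain the iterator batch by batch
    inserted += static_cast<int>(it.Current().keys.size());
    it.Next();
  }
  // Anything other than the end-of-data status would abort the load.
  bool ok = (it.status() == "out_of_range");
  std::cout << "ok=" << ok << " inserted=" << inserted << "\n";  // ok=1 inserted=3
}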
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
new file mode 100644
index 0000000000..651b491457
--- /dev/null
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -0,0 +1,103 @@
+#ifndef TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+#define TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Base class for lookup tables that require initialization.
+class InitializableLookupTable : public LookupInterface {
+ public:
+ class InitTableIterator;
+
+  // Performs batch lookups: for every element in the key tensor, Find returns
+  // the corresponding value in the values tensor.
+ // If an element is not present in the table, the given default value is used.
+ //
+ // For tables that require initialization, `Find` is available once the table
+ // is marked as initialized.
+ //
+ // Returns the following statuses:
+ // - OK: when the find finishes successfully.
+ // - FailedPrecondition: if the table is not initialized.
+ // - InvalidArgument: if any of the preconditions on the lookup key or value
+ // fails.
+ // - In addition, other implementations may provide another non-OK status
+ // specific to their failure modes.
+ Status Find(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) final;
+
+ // Returns whether the table was initialized and is ready to serve lookups.
+ bool is_initialized() const { return is_initialized_; }
+
+ // Initializes the table from the given init table iterator.
+ //
+ // Atomically, this operation prepares the table, populates it with the given
+  // iterator, and marks the table as initialized.
+ //
+ // Returns the following statuses:
+ // - OK: when the initialization was successful.
+ // - InvalidArgument: if any of the preconditions on the lookup key or value
+ // fails.
+  //  - FailedPrecondition: if the table is already initialized when
+  //    Initialize is called.
+ // - In addition, other implementations may provide another non-OK status
+ // specific to their failure modes.
+ Status Initialize(InitTableIterator& iter);
+
+ // Basic iterator to initialize lookup tables.
+ // It yields a sequence of pairs of `keys()` and `values()` Tensors, so that
+ // the consumer may insert key-value pairs in batches.
+ //
+  // When the iterator is exhausted, Valid() returns false and status() returns
+  // an OutOfRange error.
+ class InitTableIterator {
+ public:
+ InitTableIterator() {}
+
+ virtual ~InitTableIterator() {}
+
+ // Prepares the next batch of key and value tensors.
+ virtual void Next() = 0;
+
+ // Returns true if keys and values point to valid tensors.
+ virtual bool Valid() const = 0;
+
+ // Returns a tensor that contains the current batch of 'key' values.
+ virtual const Tensor& keys() const = 0;
+
+ // Returns a tensor that contains the current batch of 'value' values.
+ virtual const Tensor& values() const = 0;
+
+    // Returns an error if one has occurred, otherwise returns Status::OK.
+ virtual Status status() const = 0;
+
+ // Returns the total number of elements that the iterator will produce.
+ virtual int64 total_size() const = 0;
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(InitTableIterator);
+ };
+
+ protected:
+ // Prepares and allocates the underlying data structure to store the given
+ // number of expected elements.
+ virtual Status DoPrepare(size_t expected_num_elements) = 0;
+
+ // Populates the table in batches given keys and values as tensors into the
+ // underlying data structure.
+ virtual Status DoInsert(const Tensor& keys, const Tensor& values) = 0;
+
+ // Performs the batch find operation on the underlying data structure.
+ virtual Status DoFind(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) = 0;
+
+ mutex mu_;
+ bool is_initialized_ = false;
+};
+
+} // namespace lookup
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
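A concrete table only supplies the three hooks: DoPrepare reserves capacity, DoInsert loads a batch, and DoFind resolves keys with a default for misses; locking and initialization ordering stay in the base class. A framework-free analogue of that split (types and names below are illustrative only):

#include <iostream>
#include <unordered_map>
#include <vector>

// Framework-free analogue of the DoPrepare/DoInsert/DoFind split above:
// prepare reserves capacity, insert loads a batch, find falls back to a
// default value for missing keys.
class ToyTable {
 public:
  void DoPrepare(size_t expected) { map_.reserve(expected); }

  void DoInsert(const std::vector<int>& keys, const std::vector<float>& vals) {
    for (size_t i = 0; i < keys.size(); ++i) map_[keys[i]] = vals[i];
  }

  void DoFind(const std::vector<int>& keys, std::vector<float>* out,
              float default_value) const {
    out->clear();
    for (int k : keys) {
      auto it = map_.find(k);
      out->push_back(it == map_.end() ? default_value : it->second);
    }
  }

 private:
  std::unordered_map<int, float> map_;
};

int main() {
  ToyTable table;
  table.DoPrepare(2);
  table.DoInsert({1, 2}, {10.f, 20.f});
  std::vector<float> out;
  table.DoFind({2, 7}, &out, /*default_value=*/-1.f);
  std::cout << out[0] << " " << out[1] << "\n";  // 20 -1
}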
diff --git a/tensorflow/core/kernels/io.cc b/tensorflow/core/kernels/io.cc
new file mode 100644
index 0000000000..9d6921aa8e
--- /dev/null
+++ b/tensorflow/core/kernels/io.cc
@@ -0,0 +1,270 @@
+// See docs in ../ops/io_ops.cc
+#include <unordered_map>
+
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+namespace {
+bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape,
+ TensorSlice* slice, TensorShape* shape_slice,
+ string* error) {
+ CHECK(!shape_and_slice.empty());
+ // Syntax: dim0 dim1 dim2 ... <slice string>
+ // Where slice string is defined in core/framework/tensor_slice.h
+ std::vector<string> splits = str_util::Split(shape_and_slice, ' ');
+
+ // Must have at least 2 strings.
+ if (splits.size() < 2) {
+ *error = strings::StrCat(
+ "Need least two elements in shape_and_slice specification: ",
+ shape_and_slice);
+ return false;
+ }
+ int num_dims = splits.size() - 1;
+ shape->Clear();
+ for (int i = 0; i < num_dims; ++i) {
+ int dim;
+ if (!str_util::NumericParse32(splits[i], &dim)) {
+ *error = strings::StrCat("Non numerical dimension in shape_and_slice: ",
+ shape_and_slice);
+ return false;
+ }
+ shape->AddDim(dim);
+ }
+ // The last split is the slice specification.
+ slice->Clear();
+ auto status = slice->Parse(splits.back(), slice);
+ if (!status.ok()) {
+ *error = status.error_message();
+ return false;
+ }
+ // The specified slice must be compatible with the specified shape.
+ status = slice->SliceTensorShape(*shape, shape_slice);
+ if (!status.ok()) {
+ *error = status.error_message();
+ return false;
+ }
+ return true;
+}
+} // namespace
+
+void SaveTensors(
+ OpKernelContext* context,
+ checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func,
+ bool save_slices) {
+ const Tensor& filename_t = context->input(0);
+ {
+ const int64 size = filename_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 0 (filename) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+
+ const Tensor& tensor_names_t = context->input(1);
+ const int64 N = tensor_names_t.NumElements();
+ const string* tensor_shapes_and_slices_ptr = nullptr;
+ if (save_slices) {
+ const Tensor& tensor_shapes_and_slices_t = context->input(2);
+ OP_REQUIRES(
+ context, tensor_shapes_and_slices_t.NumElements() == N,
+ errors::InvalidArgument("Expected ", N,
+ " elements for the tensor "
+ "shapes and slices but got ",
+ tensor_shapes_and_slices_t.NumElements()));
+ tensor_shapes_and_slices_ptr =
+ tensor_shapes_and_slices_t.flat<string>().data();
+ }
+ // Path, names, and slices if save_slices is true.
+ const int kFixedInputs = save_slices ? 3 : 2;
+ OP_REQUIRES(context, context->num_inputs() == N + kFixedInputs,
+ errors::InvalidArgument("Expected totally ", N + kFixedInputs,
+ " inputs as input #1 (which is a string "
+ "tensor of saved names) contains ",
+ N, " names, but received ",
+ context->num_inputs(), " inputs"));
+
+ VLOG(1) << "About to save tensors to file " << filename_t.flat<string>()(0)
+ << "...";
+ checkpoint::TensorSliceWriter writer(filename_t.flat<string>()(0),
+ builder_func);
+
+ Status s;
+ auto tensor_names_flat = tensor_names_t.flat<string>();
+
+ string error;
+ for (int64 i = 0; i < N; ++i) {
+ const string& name = tensor_names_flat(i);
+ const Tensor& input = context->input(i + kFixedInputs);
+ TensorShape shape(input.shape());
+ TensorSlice slice(input.dims());
+ if (save_slices && !tensor_shapes_and_slices_ptr[i].empty()) {
+ const string& shape_spec = tensor_shapes_and_slices_ptr[i];
+ TensorShape slice_shape;
+ OP_REQUIRES(context, ParseShapeAndSlice(shape_spec, &shape, &slice,
+ &slice_shape, &error),
+ errors::InvalidArgument(error));
+ OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
+ errors::InvalidArgument("Slice in shape_and_slice "
+ "specification does not match the "
+ "shape of the tensor to save: ",
+ shape_spec, ", tensor: ",
+ input.shape().DebugString()));
+ }
+
+#define WRITER_ADD(dt) \
+ case dt: \
+ s = writer.Add(name, shape, slice, \
+ input.flat<EnumToDataType<dt>::Type>().data()); \
+ break
+
+ switch (input.dtype()) {
+ WRITER_ADD(DT_FLOAT);
+ WRITER_ADD(DT_DOUBLE);
+ WRITER_ADD(DT_INT32);
+ WRITER_ADD(DT_UINT8);
+ WRITER_ADD(DT_INT16);
+ WRITER_ADD(DT_INT8);
+ WRITER_ADD(DT_INT64);
+ WRITER_ADD(DT_QUINT8);
+ WRITER_ADD(DT_QINT8);
+ WRITER_ADD(DT_QINT32);
+ default:
+ context->SetStatus(errors::Unimplemented("Saving data type ",
+ DataTypeString(input.dtype()),
+ " not yet supported"));
+ return;
+ }
+#undef WRITER_ADD
+ if (!s.ok()) {
+ context->SetStatus(s);
+ return;
+ }
+ }
+
+ s = writer.Finish();
+ if (!s.ok()) {
+ context->SetStatus(s);
+ }
+}
+
+void RestoreTensor(OpKernelContext* context,
+ checkpoint::TensorSliceReader::OpenTableFunction open_func,
+ int preferred_shard, bool restore_slice) {
+ const Tensor& file_pattern_t = context->input(0);
+ {
+ const int64 size = file_pattern_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 0 (file_pattern) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+ const string& file_pattern = file_pattern_t.flat<string>()(0);
+
+ const Tensor& tensor_name_t = context->input(1);
+ {
+ const int64 size = tensor_name_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 1 (tensor_name) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+ const string& tensor_name = tensor_name_t.flat<string>()(0);
+
+ const string* tensor_shape_and_slice_ptr = nullptr;
+ if (restore_slice) {
+ const Tensor& tensor_shape_and_slice_t = context->input(2);
+ OP_REQUIRES(
+ context, tensor_shape_and_slice_t.NumElements() == 1,
+ errors::InvalidArgument("Expected 1 element for the tensor "
+ "shape and slice but got ",
+ tensor_shape_and_slice_t.NumElements()));
+ tensor_shape_and_slice_ptr = tensor_shape_and_slice_t.flat<string>().data();
+ }
+
+ // If we cannot find a cached reader we will allocate our own.
+ std::unique_ptr<checkpoint::TensorSliceReader> allocated_reader;
+
+ const checkpoint::TensorSliceReader* reader =
+ context->slice_reader_cache()->GetReader(file_pattern, open_func,
+ preferred_shard);
+ if (!reader) {
+ allocated_reader.reset(new checkpoint::TensorSliceReader(
+ file_pattern, open_func, preferred_shard));
+ reader = allocated_reader.get();
+ }
+ OP_REQUIRES_OK(context, CHECK_NOTNULL(reader)->status());
+
+ // Get the shape and type from the save file.
+ DataType type;
+ TensorShape saved_shape;
+ OP_REQUIRES(
+ context, reader->HasTensor(tensor_name, &saved_shape, &type),
+ errors::NotFound("Tensor name \"", tensor_name,
+ "\" not found in checkpoint files ", file_pattern));
+ OP_REQUIRES(
+ context, type == context->expected_output_dtype(0),
+ errors::InvalidArgument("Expected to restore a tensor of type ",
+ DataTypeString(context->expected_output_dtype(0)),
+ ", got a tensor of type ", DataTypeString(type),
+ " instead: tensor_name = ", tensor_name));
+
+ // Shape of the output and slice to load.
+ TensorShape output_shape(saved_shape);
+ TensorSlice slice_to_load(saved_shape.dims());
+ if (restore_slice && !tensor_shape_and_slice_ptr[0].empty()) {
+ const string& shape_spec = tensor_shape_and_slice_ptr[0];
+ TensorShape parsed_shape;
+ string error;
+ OP_REQUIRES(context,
+ ParseShapeAndSlice(shape_spec, &parsed_shape, &slice_to_load,
+ &output_shape, &error),
+ errors::InvalidArgument(error));
+ OP_REQUIRES(
+ context, parsed_shape.IsSameSize(saved_shape),
+ errors::InvalidArgument(
+ "Shape in shape_and_slice spec does not match the shape in the "
+ "save file: ",
+ parsed_shape.DebugString(), ", save file shape: ",
+ saved_shape.DebugString()));
+ }
+
+ Tensor* t = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &t));
+#define READER_COPY(dt) \
+ case dt: \
+ reader->CopySliceData(tensor_name, slice_to_load, \
+ t->flat<EnumToDataType<dt>::Type>().data()); \
+ break
+
+ switch (type) {
+ READER_COPY(DT_FLOAT);
+ READER_COPY(DT_DOUBLE);
+ READER_COPY(DT_INT32);
+ READER_COPY(DT_UINT8);
+ READER_COPY(DT_INT16);
+ READER_COPY(DT_INT8);
+ READER_COPY(DT_INT64);
+ default:
+ context->SetStatus(errors::Unimplemented(
+ "Restoring data type ", DataTypeString(type), " not yet supported"));
+ }
+}
+
+} // namespace tensorflow
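A shape_and_slice spec is a space-separated dimension list followed by a slice string whose grammar is defined in core/framework/tensor_slice.h; for example, "4 5 0,2:-" should describe a [4, 5] tensor of which rows 0..1 and all columns are saved. A standalone sketch of the split step only (the kernel itself uses str_util::Split and TensorSlice::Parse):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Splits a "dim0 dim1 ... <slice>" spec into its dimension list and trailing
// slice string. Illustrative only; the slice grammar itself is handled by
// TensorSlice::Parse in the kernel above.
int main() {
  const std::string spec = "4 5 0,2:-";  // shape [4,5]; save rows 0..1, all cols
  std::istringstream in(spec);
  std::vector<std::string> parts;
  for (std::string tok; in >> tok;) parts.push_back(tok);

  std::vector<int> shape;
  for (size_t i = 0; i + 1 < parts.size(); ++i) shape.push_back(std::stoi(parts[i]));
  const std::string slice = parts.back();

  std::cout << "shape = [" << shape[0] << ", " << shape[1] << "], slice = \""
            << slice << "\"\n";  // shape = [4, 5], slice = "0,2:-"
}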
diff --git a/tensorflow/core/kernels/io.h b/tensorflow/core/kernels/io.h
new file mode 100644
index 0000000000..7e548f1ad0
--- /dev/null
+++ b/tensorflow/core/kernels/io.h
@@ -0,0 +1,38 @@
+#ifndef TENSORFLOW_KERNELS_IO_H_
+#define TENSORFLOW_KERNELS_IO_H_
+
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+// Save input tensors in *context to a writer built from builder_func().
+// context must have the following inputs:
+// 0: a single element string tensor that contains the file name.
+// 1: names for the remaining tensors
+// If save_slices is true:
+// 2: shape and slice specifications.
+// rest: tensors to save
+void SaveTensors(
+ OpKernelContext* context,
+ checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func,
+ bool save_slices);
+
+// Reads a tensor from the reader built from open_func() and produces it as
+// context->output(0). "preferred_shard" is the same as the TensorSliceReader
+// preferred_shard parameter.
+//
+// context must have the following inputs:
+// 0: a single element string tensor that contains the file name.
+// 1: a single element string tensor that names the output to be restored.
+// If restore_slice is true:
+// 2: shape and slice specification of the tensor to restore.
+void RestoreTensor(OpKernelContext* context,
+ checkpoint::TensorSliceReader::OpenTableFunction open_func,
+ int preferred_shard, bool restore_slice);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_IO_H_
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
new file mode 100644
index 0000000000..6f83f01676
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -0,0 +1,69 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/l2loss_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class L2LossOp : public OpKernel {
+ public:
+ explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ // The input tensor can be of any number of dimensions, even though it's
+ // 2D in most typical applications.
+ const Tensor& input = context->input(0);
+ // The output is a single number.
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ functor::L2Loss<Device, T>()(context->eigen_device<Device>(),
+ input.flat<T>(), output->scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("L2Loss").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ L2LossOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void L2Loss<GPUDevice, T>::operator()(const GPUDevice& d, \
+ typename TTypes<T>::ConstTensor input, \
+ typename TTypes<T>::Scalar output); \
+ extern template struct L2Loss<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("L2Loss").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ L2LossOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h
new file mode 100644
index 0000000000..d307353e24
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op.h
@@ -0,0 +1,24 @@
+#ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_
+#define TENSORFLOW_KERNELS_L2LOSS_OP_H_
+// Functor definition for L2LossOp, must be compilable by nvcc.
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by L2LossOp to do the computations.
+template <typename Device, typename T>
+struct L2Loss {
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor input,
+ typename TTypes<T>::Scalar output) {
+ // We flatten the input tensor and reduce on dimension 0, producing
+ // a single number which is Mul(Sum(x^2), 0.5).
+ output.device(d) = input.square().sum() * static_cast<T>(0.5);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_L2LOSS_OP_H_
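The functor reduces the whole input to 0.5 * sum(x_i^2); for input [1, 2, 3] that is 0.5 * (1 + 4 + 9) = 7. A plain-C++ check of the arithmetic, purely illustrative:

#include <iostream>
#include <vector>

// 0.5 * sum(x_i^2), the same reduction the functor expresses with Eigen.
float L2Loss(const std::vector<float>& x) {
  float sum_sq = 0.f;
  for (float v : x) sum_sq += v * v;
  return 0.5f * sum_sq;
}

int main() {
  std::cout << L2Loss({1.f, 2.f, 3.f}) << "\n";  // 7
}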
diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
new file mode 100644
index 0000000000..858fcfe8d3
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/l2loss_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::L2Loss<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
new file mode 100644
index 0000000000..93342a7a24
--- /dev/null
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -0,0 +1,99 @@
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+
+namespace tensorflow {
+
+void LinearAlgebraOpBase::Compute(OpKernelContext* context) {
+ const Tensor& in = context->input(0);
+
+ const int input_rank = GetInputMatrixRank();
+ OP_REQUIRES(
+ context, input_rank == 2,
+ errors::InvalidArgument("Only matrix inputs are supported so far."));
+ if (SupportsBatchOperation()) {
+ OP_REQUIRES(context, in.dims() > input_rank,
+ errors::InvalidArgument("Input tensor must have rank >= %d",
+ input_rank + 1));
+ } else {
+ OP_REQUIRES(context, in.dims() == input_rank,
+ errors::InvalidArgument("Input tensor must have rank == %d",
+ input_rank));
+ }
+
+ // If the tensor rank is greater than input_rank, we consider the inner-most
+ // dimensions as matrices, and loop over all the other outer
+ // dimensions to compute the results.
+ // TODO(kalakris): Only matrix inputs are currently supported.
+ const int row_dimension = in.dims() - 2;
+ const int col_dimension = in.dims() - 1;
+ const int64 num_rows = in.dim_size(row_dimension);
+ const int64 num_cols = in.dim_size(col_dimension);
+ const TensorShape input_matrix_shape = TensorShape({num_rows, num_cols});
+ const TensorShape output_matrix_shape =
+ GetOutputMatrixShape(input_matrix_shape);
+ OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
+ errors::InvalidArgument("Output rank must be 1 or 2."));
+
+ int num_matrices = 1;
+ // The output has the shape of all the outer dimensions of the input
+ // except for the last two, plus the output_matrix_shape (if the output
+ // is not scalar). This still assumes that each input matrix is
+ // 2-dimensional, in accordance with the TODO above.
+ TensorShape output_shape;
+ if (in.dims() == 2) {
+ output_shape = output_matrix_shape;
+ } else {
+ for (int dim = 0; dim <= in.dims() - 3; ++dim) {
+ num_matrices *= in.dim_size(dim);
+ output_shape.AddDim(in.dim_size(dim));
+ }
+ for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) {
+ output_shape.AddDim(output_matrix_shape.dim_size(dim));
+ }
+ }
+
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &out));
+
+ auto shard = [this, &in, &input_matrix_shape, &output_matrix_shape, context,
+ out](int64 begin, int64 end) {
+ for (int64 i = begin; i < end; ++i) {
+ ComputeMatrix(context, i, in, input_matrix_shape, out,
+ output_matrix_shape);
+ }
+ };
+
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, num_matrices,
+ GetCostPerUnit(input_matrix_shape), shard);
+}
+
+template <typename Scalar, bool SupportsBatchOperationT>
+void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeMatrix(
+ OpKernelContext* context, int64 matrix_index, const Tensor& in,
+ const TensorShape& input_matrix_shape, Tensor* out,
+ const TensorShape& output_matrix_shape) {
+ // TODO(kalakris): Handle alignment if possible. Eigen::Map is
+ // unaligned by default.
+ ConstMatrixMap input(in.flat<Scalar>().data() +
+ matrix_index * input_matrix_shape.num_elements(),
+ input_matrix_shape.dim_size(0),
+ input_matrix_shape.dim_size(1));
+
+ // The output matrix shape may not be a matrix.
+ int num_output_rows =
+ output_matrix_shape.dims() >= 1 ? output_matrix_shape.dim_size(0) : 1;
+ int num_output_cols =
+ output_matrix_shape.dims() == 2 ? output_matrix_shape.dim_size(1) : 1;
+ MatrixMap output(out->flat<Scalar>().data() +
+ matrix_index * output_matrix_shape.num_elements(),
+ num_output_rows, num_output_cols);
+ ComputeMatrix(context, input, &output);
+}
+
+// Explicitly instantiate LinearAlgebraOp for the scalar types we expect to use.
+template class LinearAlgebraOp<float, false>;
+template class LinearAlgebraOp<float, true>;
+template class LinearAlgebraOp<double, false>;
+template class LinearAlgebraOp<double, true>;
+
+} // namespace tensorflow
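The batching logic treats the innermost two dimensions as the matrix and everything outside them as the batch, so an input of shape [2, 3, 4, 4] yields num_matrices = 6 and, for a scalar per-matrix output, an output shape of [2, 3]. A small sketch of that shape bookkeeping in plain C++ (names are illustrative):

#include <iostream>
#include <vector>

// Reproduces the shape bookkeeping above: outer dims form the batch, the last
// two dims are the matrix, and the per-matrix output shape is appended.
int main() {
  std::vector<long long> in_dims = {2, 3, 4, 4};  // batch of 2*3 matrices, each 4x4
  std::vector<long long> out_matrix_dims = {};    // scalar output per matrix

  long long num_matrices = 1;
  std::vector<long long> out_dims;
  for (size_t d = 0; d + 2 < in_dims.size(); ++d) {  // all but the last two dims
    num_matrices *= in_dims[d];
    out_dims.push_back(in_dims[d]);
  }
  for (long long d : out_matrix_dims) out_dims.push_back(d);

  std::cout << "num_matrices = " << num_matrices << ", output rank = "
            << out_dims.size() << "\n";  // num_matrices = 6, output rank = 2
}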
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
new file mode 100644
index 0000000000..471f11e25f
--- /dev/null
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -0,0 +1,123 @@
+#ifndef TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/work_sharder.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// A base class to support linear algebra functionality, similar to the
+// numpy.linalg module. Supports batch computation on several matrices at once,
+// sharding the computations across different threads if necessary.
+//
+// TODO(kalakris): This needs to be expanded to support binary inputs, and
+// multiple outputs.
+class LinearAlgebraOpBase : public OpKernel {
+ public:
+ explicit LinearAlgebraOpBase(OpKernelConstruction* context)
+ : OpKernel(context) {}
+ ~LinearAlgebraOpBase() override {}
+
+ // Return the expected rank of the input.
+ // TODO(kalakris): This should be a virtual function to support vector inputs.
+ int GetInputMatrixRank() { return 2; }
+
+ // Return the output shape of each individual matrix operation. Must be
+ // rank 0, 1, or 2. Scalar outputs are rank 0.
+ virtual TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) = 0;
+
+ // Return the cost per matrix operation. Cost per unit is assumed to be
+ // roughly 1ns, based on comments in core/util/work_sharder.cc.
+ virtual int64 GetCostPerUnit(const TensorShape& input_matrix_shape) = 0;
+
+ // If SupportsBatchOperation() returns false, this Op will only accept rank 2
+ // (if the supported input type is a matrix). If it returns true, the Op will
+ // accept inputs of rank >= 3, and repeatedly execute the operation on all
+ // matrices in the innermost two dimensions.
+ virtual bool SupportsBatchOperation() = 0;
+
+ // Perform the actual computation on an input matrix, and store the results
+ // in the output. This will be called repeatedly for a single call to
+ // Compute(), if multiple matrices exist in the input Tensor.
+ //
+ // This function should only compute the results for a single input matrix.
+ // The 'matrix_index' parameter specifies the index of the matrix to be used
+ // from the input, and the index of the matrix to be written to in the output.
+ // The input matrix is in row major order, and is located at the memory
+ // address
+ // in.flat<Scalar>().data() +
+ // matrix_index * input_matrix_shape.num_elements().
+ // The output matrix is in row major order, and is located at the memory
+ // address
+ // out->flat<Scalar>().data() +
+ // matrix_index * output_matrix_shape.num_elements().
+ // The LinearAlgebraOp<Scalar> class below has functionality which performs
+ // this mapping and presents an interface based on the Eigen::MatrixBase API.
+ virtual void ComputeMatrix(OpKernelContext* context, int64 matrix_index,
+ const Tensor& in,
+ const TensorShape& input_matrix_shape, Tensor* out,
+ const TensorShape& output_matrix_shape) = 0;
+
+ void Compute(OpKernelContext* context) override;
+};
+
+// A base class for linear algebra ops templated on the scalar type.
+//
+// This base class encapsulates the functionality of mapping the input and
+// output tensors using Eigen::Map, so that the Eigen::MatrixBase API may be
+// directly used by derived classes.
+// SupportsBatchOperationT is a bool template argument which if set to true
+// will allow the Op to process batches of matrices (rank >= 3); if set to
+// false the Op will only accept rank 2 inputs.
+template <typename Scalar, bool SupportsBatchOperationT>
+class LinearAlgebraOp : public LinearAlgebraOpBase {
+ public:
+ explicit LinearAlgebraOp(OpKernelConstruction* context)
+ : LinearAlgebraOpBase(context) {}
+
+ using ConstMatrixMap =
+ Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>>;
+ using MatrixMap = Eigen::Map<
+ Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+ // Perform the actual computation on the input matrix, and store the results
+ // in the output. This will be called repeatedly for a single call to
+ // Compute(), if multiple matrices exist in the input Tensor.
+ virtual void ComputeMatrix(OpKernelContext* context,
+ const ConstMatrixMap& input,
+ MatrixMap* output) = 0;
+
+ bool SupportsBatchOperation() final { return SupportsBatchOperationT; }
+
+ // A concrete implementation of LinearAlgebraOpBase::ComputeMatrix().
+ void ComputeMatrix(OpKernelContext* context, int64 matrix_index,
+ const Tensor& in, const TensorShape& input_matrix_shape,
+ Tensor* out, const TensorShape& output_matrix_shape) final;
+};
+
+// Declare that LinearAlgebraOp is explicitly instantiated in
+// linalg_ops_common.cc for float and double.
+extern template class LinearAlgebraOp<float, false>;
+extern template class LinearAlgebraOp<float, true>;
+extern template class LinearAlgebraOp<double, false>;
+extern template class LinearAlgebraOp<double, true>;
+
+} // namespace tensorflow
+
+#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(OpName).Device(DEVICE_CPU).TypeConstraint<Scalar>("T"), OpClass)
+
+#endif // TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
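A derived kernel fills in the output shape, the cost estimate, and the per-matrix computation; the Eigen::Map plumbing is inherited. A hypothetical sketch against the interface above (the "MatrixTrace" op, its class, and its cost figure are illustrative and not part of this change; it assumes linalg_ops_common.h is included):

#include "tensorflow/core/kernels/linalg_ops_common.h"

namespace tensorflow {

// Hypothetical subclass, sketched only to show which hooks a derived kernel
// fills in: output shape, cost per matrix, and the per-matrix computation.
template <typename Scalar, bool SupportsBatchOperationT>
class MatrixTraceOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
 public:
  typedef LinearAlgebraOp<Scalar, SupportsBatchOperationT> Base;
  using typename Base::ConstMatrixMap;
  using typename Base::MatrixMap;

  explicit MatrixTraceOp(OpKernelConstruction* context) : Base(context) {}

  TensorShape GetOutputMatrixShape(
      const TensorShape& input_matrix_shape) override {
    return TensorShape({});  // one scalar per input matrix
  }

  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
    return input_matrix_shape.dim_size(0);  // roughly one add per diagonal entry
  }

  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
                     MatrixMap* output) override {
    (*output)(0, 0) = input.trace();
  }
};

// Registration would follow the macro above, for example:
// REGISTER_LINALG_OP("MatrixTrace", (MatrixTraceOp<float, false>), float);

}  // namespace tensorflow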
diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc
new file mode 100644
index 0000000000..f490f5ddd3
--- /dev/null
+++ b/tensorflow/core/kernels/listdiff_op.cc
@@ -0,0 +1,75 @@
+#include <unordered_set>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+template <typename T>
+class ListDiffOp : public OpKernel {
+ public:
+ explicit ListDiffOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({dt, dt}, {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& x = context->input(0);
+ const Tensor& y = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(x.shape()),
+ errors::InvalidArgument("x should be a 1D vector."));
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(y.shape()),
+ errors::InvalidArgument("y should be a 1D vector."));
+
+ std::unordered_set<T> y_set;
+ const auto Ty = y.vec<T>();
+ const int y_size = Ty.size();
+ y_set.reserve(y_size);
+ for (int i = 0; i < y_size; ++i) {
+ y_set.insert(Ty(i));
+ }
+
+ // Compute the size of the output.
+ const auto Tx = x.vec<T>();
+ const int x_size = Tx.size();
+
+ int out_size = 0;
+ for (int i = 0; i < x_size; ++i) {
+ if (y_set.count(Tx(i)) == 0) {
+ ++out_size;
+ }
+ }
+
+ // Allocate and populate outputs.
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, {out_size}, &out));
+ auto Tout = out->vec<T>();
+
+ Tensor* indices = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {out_size}, &indices));
+ auto Tindices = indices->vec<int32>();
+
+ for (int i = 0, p = 0; i < x_size; ++i) {
+ if (y_set.count(Tx(i)) == 0) {
+ Tout(p) = Tx(i);
+ Tindices(p) = i;
+ p++;
+ }
+ }
+ }
+};
+
+#define REGISTER_LISTDIFF(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ListDiff").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ListDiffOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_LISTDIFF);
+#undef REGISTER_LISTDIFF
+
+} // namespace tensorflow
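ListDiff keeps the elements of x that never appear in y, in their original order, together with their positions; for x = [1, 2, 3, 4, 5, 6] and y = [1, 3, 5] the outputs are out = [2, 4, 6] and idx = [1, 3, 5]. A standalone sketch of the same two-pass scheme (hash y, then filter x):

#include <iostream>
#include <unordered_set>
#include <vector>

// Same two-pass scheme as the kernel: build a set from y, then keep every
// x element (and its index) that is absent from the set.
int main() {
  std::vector<int> x = {1, 2, 3, 4, 5, 6};
  std::vector<int> y = {1, 3, 5};
  std::unordered_set<int> y_set(y.begin(), y.end());

  std::vector<int> out, idx;
  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
    if (y_set.count(x[i]) == 0) {
      out.push_back(x[i]);
      idx.push_back(i);
    }
  }
  for (size_t i = 0; i < out.size(); ++i)
    std::cout << out[i] << "@" << idx[i] << " ";  // 2@1 4@3 6@5
  std::cout << "\n";
}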
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
new file mode 100644
index 0000000000..ec84145f75
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -0,0 +1,77 @@
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+class AssertOp : public OpKernel {
+ public:
+ explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& cond = ctx->input(0);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(cond.shape()),
+ errors::InvalidArgument("In[0] should be a scalar: ",
+ cond.shape().ShortDebugString()));
+
+ if (cond.scalar<bool>()()) {
+ return;
+ }
+ string msg = "assertion failed: ";
+ for (int i = 1; i < ctx->num_inputs(); ++i) {
+ strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
+ "]");
+ if (i < ctx->num_inputs() - 1) strings::StrAppend(&msg, " ");
+ }
+ ctx->SetStatus(errors::InvalidArgument(msg));
+ }
+
+ private:
+ int32 summarize_ = 0;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Assert").Device(DEVICE_CPU), AssertOp);
+
+class PrintOp : public OpKernel {
+ public:
+ explicit PrintOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), call_counter_(0) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("message", &message_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("first_n", &first_n_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (IsRefType(ctx->input_dtype(0))) {
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ ctx->set_output(0, ctx->input(0));
+ }
+ if (first_n_ >= 0) {
+ mutex_lock l(mu_);
+ if (call_counter_ >= first_n_) return;
+ call_counter_++;
+ }
+ string msg;
+ strings::StrAppend(&msg, message_);
+ for (int i = 1; i < ctx->num_inputs(); ++i) {
+ strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
+ "]");
+ }
+ LOG(INFO) << msg;
+ }
+
+ private:
+ mutex mu_;
+ int64 call_counter_ GUARDED_BY(mu_) = 0;
+ int64 first_n_ = 0;
+ int32 summarize_ = 0;
+ string message_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp);
+
+} // end namespace tensorflow
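PrintOp forwards its first input unchanged and logs at most first_n times (a negative first_n means no limit), with the counter guarded by a mutex so concurrent Compute calls do not over-print. A framework-free sketch of that throttle (class and names are illustrative):

#include <iostream>
#include <mutex>
#include <string>

// Logs at most `first_n` times; first_n < 0 means no limit. Mirrors the
// counter-under-mutex throttle used by PrintOp above.
class FirstNLogger {
 public:
  explicit FirstNLogger(int first_n) : first_n_(first_n) {}

  void Log(const std::string& msg) {
    if (first_n_ >= 0) {
      std::lock_guard<std::mutex> lock(mu_);
      if (call_counter_ >= first_n_) return;
      ++call_counter_;
    }
    std::cout << msg << "\n";
  }

 private:
  std::mutex mu_;
  int call_counter_ = 0;
  const int first_n_;
};

int main() {
  FirstNLogger logger(/*first_n=*/3);
  for (int i = 0; i < 5; ++i) logger.Log("tick");  // prints "tick" three times
}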
diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc
new file mode 100644
index 0000000000..a7af6eb303
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops_test.cc
@@ -0,0 +1,87 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace {
+
+class PrintingGraphTest : public OpsTestBase {
+ protected:
+ Status Init(DataType input_type1, DataType input_type2, string msg = "",
+ int first_n = -1, int summarize = 3) {
+ RequireDefaultOps();
+ TF_CHECK_OK(NodeDefBuilder("op", "Print")
+ .Input(FakeInput(input_type1))
+ .Input(FakeInput(2, input_type2))
+ .Attr("message", msg)
+ .Attr("first_n", first_n)
+ .Attr("summarize", summarize)
+ .Finalize(node_def()));
+ return InitOp();
+ }
+};
+
+TEST_F(PrintingGraphTest, Int32Success_6) {
+ ASSERT_OK(Init(DT_INT32, DT_INT32));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, Int32Success_Summarize6) {
+ ASSERT_OK(Init(DT_INT32, DT_INT32, "", -1, 6));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, StringSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, MsgSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING, "Message: "));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, FirstNSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING, "", 3));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+  // Run 4 times, but only the first 3 runs should print, as intended.
+ for (int i = 0; i < 4; i++) ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+} // end namespace
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
new file mode 100644
index 0000000000..9781bcfa59
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -0,0 +1,116 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+#include "tensorflow/core/kernels/lookup_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Iterator to initialize tables given 'keys' and 'values' tensors.
+//
+// Both tensors are returned in the first iteration. The iterator does not
+// loop over individual elements, since lookup table insertions can process
+// whole batches.
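+//
+// Illustrative usage sketch (editorial, not part of the original change):
+//
+//   KeyValueTensorIterator iter(&keys, &values);
+//   TF_RETURN_IF_ERROR(table->Initialize(iter));
+//
+// Initialize() consumes the whole key/value batch in a single Valid()/Next()
+// cycle of this iterator.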
+class KeyValueTensorIterator
+ : public InitializableLookupTable::InitTableIterator {
+ public:
+ // keys and values are not owned by the iterator.
+ explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values)
+ : keys_(keys), values_(values), valid_(true), status_(Status::OK()) {
+ TensorShape key_shape = keys_->shape();
+ if (!key_shape.IsSameSize(values_->shape())) {
+ valid_ = false;
+ status_ = errors::InvalidArgument(
+ "keys and values should have the same dimension.",
+ key_shape.DebugString(), " vs ", values_->shape().DebugString());
+ }
+ if (key_shape.num_elements() == 0) {
+ valid_ = false;
+ status_ =
+ errors::InvalidArgument("keys and values cannot be empty tensors.");
+ }
+ }
+
+ bool Valid() const override { return valid_; }
+
+ void Next() override {
+ valid_ = false;
+ status_ = errors::OutOfRange("No more data.");
+ }
+
+ const Tensor& keys() const override { return *keys_; }
+
+ const Tensor& values() const override { return *values_; }
+
+ Status status() const override { return status_; }
+
+ int64 total_size() const {
+ return keys_ == nullptr ? -1 : keys_->NumElements();
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator);
+
+ const Tensor* keys_; // Doesn't own it.
+ const Tensor* values_; // Doesn't own it.
+ bool valid_; // true if the iterator points to an existing range.
+ Status status_;
+};
+
+} // namespace lookup
+
+// Kernel to initialize a lookup table given key and value tensors.
+// After this operation, the table becomes read-only.
+class InitializeTableOp : public OpKernel {
+ public:
+ explicit InitializeTableOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ lookup::InitializableLookupTable* table;
+ OP_REQUIRES_OK(ctx,
+ GetInitializableLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+ table->value_dtype()};
+ DataTypeVector expected_outputs = {};
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
+
+ const Tensor& keys = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(keys.shape()),
+ errors::InvalidArgument("Keys must be a vector, but received ",
+ keys.shape().DebugString()));
+
+ const Tensor& values = ctx->input(2);
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsVector(values.shape()),
+ errors::InvalidArgument("Values must be a vector, but received ",
+ values.shape().DebugString()));
+
+ OP_REQUIRES(ctx, keys.NumElements() == values.NumElements(),
+ errors::InvalidArgument(
+ "Keys and values must have the same size ",
+ keys.NumElements(), " vs ", values.NumElements()));
+
+ lookup::KeyValueTensorIterator iter(&keys, &values);
+ OP_REQUIRES_OK(ctx, table->Initialize(iter));
+ }
+
+ private:
+ mutex mu_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InitializeTable").Device(DEVICE_CPU),
+ InitializeTableOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
new file mode 100644
index 0000000000..2bab4df94f
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -0,0 +1,166 @@
+#include "tensorflow/core/kernels/lookup_table_op.h"
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <utility>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Lookup table that wraps an unordered_map, with the key and value data types
+// specified by the template parameters.
+//
+// This table is recommended for arbitrary key values.
+//
+// For lookups, the table must be initialized (allocated and populated). Once
+// the table is marked as initialized it becomes read-only.
+//
+// Sample use case:
+//
+// HashTable<int64, int64> table; // int64 -> int64.
+// table.Prepare(10); // Prepare the underlying data structure; the number of
+// // elements is required by the interface, but not used.
+// // Populate the table, elements could be added in one or multiple calls.
+// table.Insert(key_tensor, value_tensor); // Populate the table.
+// ...
+// table.set_is_initialized();
+//
+// table.Find(in_t, &out_t, default_t)
+//
+template <class K, class V>
+class HashTable : public InitializableLookupTable {
+ public:
+ size_t size() const override { return table_ ? table_->size() : 0; }
+
+ DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
+
+ DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
+
+ protected:
+ Status DoPrepare(size_t unused) override {
+ if (is_initialized_) {
+ return errors::Aborted("HashTable already initialized.");
+ }
+ if (!table_) {
+ table_ = std::unique_ptr<std::unordered_map<K, V>>(
+ new std::unordered_map<K, V>());
+ }
+ return Status::OK();
+ }
+
+ Status DoInsert(const Tensor& keys, const Tensor& values) override {
+ if (!table_) {
+ return errors::FailedPrecondition("HashTable is not prepared.");
+ }
+
+ const auto key_values = keys.flat<K>();
+ const auto value_values = values.flat<V>();
+ for (size_t i = 0; i < key_values.size(); ++i) {
+ const K& key = key_values(i);
+ const V& value = value_values(i);
+ const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
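+ // Editorial note on the assumed semantics of gtl::LookupOrInsert: it
+ // returns a reference to the value already stored under `key` when the key
+ // is present, and inserts `value` otherwise. The check below therefore
+ // rejects duplicate keys that map to conflicting values.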
+ if (previous_value != value) {
+ return errors::FailedPrecondition(
+ "HashTable has different value for same key. Key ", key, " has ",
+ previous_value, " and trying to add value ", value);
+ }
+ }
+ return Status::OK();
+ }
+
+ Status DoFind(const Tensor& key, Tensor* value,
+ const Tensor& default_value) override {
+ const V default_val = default_value.flat<V>()(0);
+ const auto key_values = key.flat<K>();
+ auto value_values = value->flat<V>();
+
+ for (size_t i = 0; i < key_values.size(); ++i) {
+ value_values(i) =
+ gtl::FindWithDefault(*table_, key_values(i), default_val);
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<std::unordered_map<K, V>> table_;
+};
+
+} // namespace lookup
+
+// Table lookup op. Performs the lookup operation on the given table.
+class LookupTableFindOp : public OpKernel {
+ public:
+ explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ lookup::LookupInterface* table;
+ OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+ table->value_dtype()};
+ DataTypeVector expected_outputs = {table->value_dtype()};
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
+
+ const Tensor& input = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("Input must be a vector, not ",
+ input.shape().DebugString()));
+
+ const Tensor& default_value = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(default_value.shape()),
+ errors::InvalidArgument("Default value must be a scalar, not ",
+ default_value.shape().DebugString()));
+
+ Tensor* out;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output("output_values", input.shape(), &out));
+
+ OP_REQUIRES_OK(ctx, table->Find(input, out, default_value));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LookupTableFind").Device(DEVICE_CPU),
+ LookupTableFindOp);
+
+// Op that returns the size of the given table.
+class LookupTableSizeOp : public OpKernel {
+ public:
+ explicit LookupTableSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ lookup::LookupInterface* table;
+ OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ Tensor* out;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output("size", TensorShape({}), &out));
+ out->flat<int64>().setConstant(table->size());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LookupTableSize").Device(DEVICE_CPU),
+ LookupTableSizeOp);
+
+// Register the HashTable op with the currently supported key and value types.
+#define REGISTER_KERNEL(key_dtype, value_dtype) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("HashTable") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<key_dtype>("key_dtype") \
+ .TypeConstraint<value_dtype>("value_dtype"), \
+ LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
+ value_dtype>)
+
+REGISTER_KERNEL(string, int64);
+REGISTER_KERNEL(int64, string);
+
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
new file mode 100644
index 0000000000..dc53ce33a6
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -0,0 +1,80 @@
+#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+#define TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/lookup_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Lookup table op that supports different table implementations specified by
+// the 'Container' template. Container must be derived from LookupInterface.
+// The key and value are of the templated types "key_dtype" and "value_dtype",
+// respectively.
+template <class Container, class key_dtype, class value_dtype>
+class LookupTableOp : public OpKernel {
+ public:
+ // ctx is not owned by this class.
+ explicit LookupTableOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), table_handle_set_(false) {
+ OP_REQUIRES_OK(ctx, ctx->allocate_persistent(tensorflow::DT_STRING,
+ tensorflow::TensorShape({2}),
+ &table_handle_, nullptr));
+ }
+
+ // ctx is not owned by this function.
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!table_handle_set_) {
+ OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def()));
+ auto creator = [this](lookup::LookupInterface** ret) {
+ *ret = new Container();
+ return Status::OK();
+ };
+
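+ // Editorial note (assumed ResourceMgr semantics): LookupOrCreate either
+ // returns the table already registered under (container, name) or invokes
+ // `creator` to build a new Container. Either way it hands back a
+ // referenced pointer, which ScopedUnref below releases.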
+ lookup::LookupInterface* table = nullptr;
+ OP_REQUIRES_OK(
+ ctx, cinfo_.resource_manager()
+ ->template LookupOrCreate<lookup::LookupInterface>(
+ cinfo_.container(), cinfo_.name(), &table, creator));
+ core::ScopedUnref unref_me(table);
+
+ OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
+ *table, DataTypeToEnum<key_dtype>::v(),
+ DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
+
+ auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ table_handle_set_ = true;
+ }
+ ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+ }
+
+ ~LookupTableOp() override {
+ // If the table object was not shared, delete it.
+ if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(
+ cinfo_.resource_manager()->template Delete<lookup::LookupInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ private:
+ mutex mu_;
+ PersistentTensor table_handle_ GUARDED_BY(mu_);
+ bool table_handle_set_ GUARDED_BY(mu_);
+ ContainerInfo cinfo_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(LookupTableOp);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
new file mode 100644
index 0000000000..634c11e4a5
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -0,0 +1,72 @@
+#include "tensorflow/core/kernels/lookup_util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+namespace lookup {
+namespace {
+
+Status GetTableHandle(const string& input_name, OpKernelContext* ctx,
+ string* container, string* table_handle) {
+ {
+ mutex* mu;
+ TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu));
+ mutex_lock l(*mu);
+ Tensor tensor;
+ TF_RETURN_IF_ERROR(ctx->mutable_input(input_name, &tensor, true));
+ if (tensor.NumElements() != 2) {
+ return errors::InvalidArgument(
+ "Lookup table handle must have exactly two elements (container, name), "
+ "but had shape: ",
+ tensor.shape().DebugString());
+ }
+ auto h = tensor.flat<string>();
+ *container = h(0);
+ *table_handle = h(1);
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
+ LookupInterface** table) {
+ string container;
+ string table_handle;
+ TF_RETURN_IF_ERROR(
+ GetTableHandle(input_name, ctx, &container, &table_handle));
+ return ctx->resource_manager()->Lookup(container, table_handle, table);
+}
+
+Status GetInitializableLookupTable(const string& input_name,
+ OpKernelContext* ctx,
+ InitializableLookupTable** table) {
+ string container;
+ string table_handle;
+ TF_RETURN_IF_ERROR(
+ GetTableHandle(input_name, ctx, &container, &table_handle));
+ LookupInterface* lookup_table;
+ TF_RETURN_IF_ERROR(
+ ctx->resource_manager()->Lookup(container, table_handle, &lookup_table));
+ *table = dynamic_cast<InitializableLookupTable*>(lookup_table);
+ if (*table == nullptr) {
+ lookup_table->Unref();
+ return errors::InvalidArgument("Table ", container, " ", table_handle,
+ " is not initializable");
+ }
+ return Status::OK();
+}
+
+Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype,
+ DataType value_dtype, const string& table_name) {
+ if (table.key_dtype() != key_dtype || table.value_dtype() != value_dtype) {
+ return errors::InvalidArgument(
+ "Conflicting key/value dtypes ", key_dtype, "->", value_dtype, " with ",
+ table.key_dtype(), "-", table.value_dtype(), " for table ", table_name);
+ }
+ return Status::OK();
+}
+
+} // namespace lookup
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h
new file mode 100644
index 0000000000..991a757edd
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_util.h
@@ -0,0 +1,31 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Gets the LookupTable stored in ctx->resource_manager() under the handle
+// passed through the op input named input_name. Returns an error if the table
+// doesn't exist.
+Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
+ LookupInterface** table);
+
+// Gets the InitializableLookupTable stored in ctx->resource_manager() under
+// the handle passed through the op input named input_name. Returns an error if
+// the table doesn't exist or is not initializable.
+Status GetInitializableLookupTable(const string& input_name,
+ OpKernelContext* ctx,
+ InitializableLookupTable** table);
+
+// Verifies that the given key_dtype and value_dtype match the corresponding
+// table's data types.
+Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype,
+ DataType value_dtype, const string& table_name);
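+
+// Illustrative usage sketch (editorial), assuming an op whose first input is
+// the table handle named "table_handle":
+//
+//   lookup::LookupInterface* table;
+//   OP_REQUIRES_OK(ctx, lookup::GetLookupTable("table_handle", ctx, &table));
+//   core::ScopedUnref unref_me(table);  // Lookup() adds a reference.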
+} // namespace lookup
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
new file mode 100644
index 0000000000..e5abf5906f
--- /dev/null
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -0,0 +1,228 @@
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#ifndef __ANDROID__
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
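+//
+// For example (illustrative, not in the original): with depth = 4 and
+// depth_radius = 1 the result is
+//   1 1 0 0
+//   1 1 1 0
+//   0 1 1 1
+//   0 0 1 1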
+static void GetBandMatrix(int depth, int64 depth_radius,
+ Eigen::Tensor<float, 2, Eigen::RowMajor>* result) {
+ result->setZero();
+ for (int row = 0; row < depth; ++row) {
+ const int begin = std::max<int>(0, row - depth_radius);
+ const int end = std::min<int64>(depth, row + depth_radius + 1);
+ Eigen::DSizes<ptrdiff_t, 2> start(row, begin);
+ Eigen::DSizes<ptrdiff_t, 2> sizes(1, end - begin);
+ result->slice(start, sizes).setConstant(1.0f);
+ }
+}
+
+class LRNOp : public OpKernel {
+ public:
+ explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in = context->input(0);
+ OP_REQUIRES(context, in.dims() == 4,
+ errors::InvalidArgument("in must be 4-dimensional"));
+ const int64 batch = in.dim_size(0);
+ const int64 rows = in.dim_size(1);
+ const int64 cols = in.dim_size(2);
+ const int64 depth = in.dim_size(3);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({batch, rows, cols, depth}), &output));
+
+#ifdef __ANDROID__
+ MognetLRN(in, batch, rows, cols, depth, output);
+#else
+ const int nodes = cols * rows;
+ auto in_shaped = in.shaped<float, 2>({nodes * batch, depth});
+
+ // Contracting the squared input with the band matrix sums the squared
+ // activations over the depth window around each channel.
+ Eigen::Tensor<float, 2, Eigen::RowMajor> multiplier(depth, depth);
+ GetBandMatrix(depth, depth_radius_, &multiplier);
+
+ auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ /// TODO(keveman): Optimize for beta in {0, 1, 0.5}
+ out_shaped.device(context->eigen_cpu_device()) =
+ in_shaped /
+ in_shaped.square()
+ .contract(multiplier, dims)
+ .unaryExpr([this](float x) { return bias_ + alpha_ * x; })
+ .pow(beta_);
+#endif
+ }
+
+ private:
+ typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+ void MognetLRN(const Tensor& in, const int batch, const int rows,
+ const int cols, const int depth, Tensor* out) {
+ Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>>
+ data_in(in.flat<float>().data(), depth, batch * rows * cols);
+
+ Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> data_out(
+ out->flat<float>().data(), depth, batch * rows * cols);
+
+ const int double_depth_radius = depth_radius_ * 2;
+ Eigen::VectorXf padded_square(data_in.rows() + double_depth_radius);
+ padded_square.setZero();
+ for (int r = 0; r < data_in.cols(); ++r) {
+ // Do local response normalization for data_in(:, r)
+ // First, compute the squares and store them in a buffer for repeated use.
+ padded_square.block(depth_radius_, 0, data_out.rows(), 1) =
+ data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_;
+ // Then, compute the scale and write it to data_out.
+ float accumulated_scale = 0;
+ for (int i = 0; i < double_depth_radius; ++i) {
+ accumulated_scale += padded_square(i);
+ }
+ for (int i = 0; i < data_in.rows(); ++i) {
+ accumulated_scale += padded_square(i + double_depth_radius);
+ data_out(i, r) = bias_ + accumulated_scale;
+ accumulated_scale -= padded_square(i);
+ }
+ }
+
+ // For a few special values of beta, the pow computation can be replaced
+ // by cheaper operations.
+ if (beta_ == 1) {
+ data_out.array() = data_in.array() * data_out.array().inverse();
+ } else if (beta_ == 0.5) {
+ data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
+ } else {
+ data_out.array() = data_in.array() * data_out.array().pow(-beta_);
+ }
+ }
+
+ int64 depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRN").Device(DEVICE_CPU), LRNOp);
+
+#ifndef __ANDROID__
+
+class LRNGradOp : public OpKernel {
+ public:
+ explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in_grads = context->input(0);
+ const Tensor& in_image = context->input(1);
+ const Tensor& out_image = context->input(2);
+
+ OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4,
+ errors::InvalidArgument("inputs must be 4-dimensional"));
+ const int64 batch = in_grads.dim_size(0);
+ const int64 rows = in_grads.dim_size(1);
+ const int64 cols = in_grads.dim_size(2);
+ const int64 depth = in_grads.dim_size(3);
+ OP_REQUIRES(
+ context,
+ in_image.dim_size(0) == batch && in_image.dim_size(1) == rows &&
+ in_image.dim_size(2) == cols && in_image.dim_size(3) == depth &&
+ out_image.dim_size(0) == batch && out_image.dim_size(1) == rows &&
+ out_image.dim_size(2) == cols && out_image.dim_size(3) == depth,
+ errors::InvalidArgument(
+ "input_grads, input_image, and out_image should have the same "
+ "shape"));
+ const auto nodes = cols * rows;
+ auto grads_shaped = in_grads.shaped<float, 2>({nodes * batch, depth});
+ auto in_shaped = in_image.shaped<float, 2>({nodes * batch, depth});
+ auto activations = out_image.shaped<float, 2>({nodes * batch, depth});
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({batch, rows, cols, depth}), &output));
+ auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+ out_shaped.setZero();
+
+ auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+ depth](int64 begin, int64 end) {
+ for (int64 i = begin; i < end; ++i) {
+ for (int64 j = 0; j < depth; ++j) {
+ // Let y be the LRN activations and x be the inputs along the depth
+ // dimension. (LRN operates independently along rows, cols, and
+ // batch).
+ // We have
+ // yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
+ // x_j^2))^beta
+ //
+ // Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
+ // x_j^2))
+ // dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta)
+ // dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta)
+ //
+ // NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta).
+ // However, this is numerically unstable for small values of xi. We
+ // compute N explicitly here to avoid that.
+
+ int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+ int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+ float norm = 0.0f;
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ norm += in_shaped(i, k) * in_shaped(i, k);
+ }
+ norm = alpha_ * norm + bias_;
+ DCHECK_GT(norm, 1e-6);
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ float dyi = -2.0f * alpha_ * beta_ * in_shaped(i, k) *
+ activations(i, j) / norm;
+ if (k == j) {
+ dyi += std::pow(norm, -beta_);
+ }
+ dyi *= grads_shaped(i, j);
+ const_cast<TTypes<float, 2>::Tensor&>(out_shaped)(i, k) += dyi;
+ }
+ }
+ }
+ };
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+ depth * depth, shard);
+ }
+
+ private:
+ typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+ int64 depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRNGrad").Device(DEVICE_CPU), LRNGradOp);
+
+#endif // __ANDROID__
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
new file mode 100644
index 0000000000..4c338b6cb3
--- /dev/null
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -0,0 +1,185 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+static const float tol_ = 1e-4;
+
+class LRNFloatTest : public OpsTestBase {
+ protected:
+ LRNFloatTest() : philox_(123, 17), rand_(&philox_) { RequireDefaultOps(); }
+
+ int GetIntAttr(const string& name) {
+ int value;
+ TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
+ return value;
+ }
+
+ float GetFloatAttr(const string& name) {
+ float value;
+ TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
+ return value;
+ }
+
+ bool Compare() {
+ const auto& input = GetInput(0);
+ const int64 batch_size = input.dim_size(0);
+ const int64 rows = input.dim_size(1);
+ const int64 cols = input.dim_size(2);
+ const int64 depth = input.dim_size(3);
+ const int64 rest = cols * rows * batch_size;
+
+ const int64 depth_radius = GetIntAttr("depth_radius");
+ const float bias = GetFloatAttr("bias");
+ const float alpha = GetFloatAttr("alpha");
+ const float beta = GetFloatAttr("beta");
+
+ Eigen::Tensor<float, 4, Eigen::RowMajor> expected(batch_size, rows, cols,
+ depth);
+ auto out = expected.reshape(Eigen::DSizes<int64, 2>{rest, depth});
+ auto in = input.shaped<float, 2>({rest, depth});
+
+ for (int64 i = 0; i < rest; ++i) {
+ Eigen::Tensor<float, 1, Eigen::RowMajor> out_col(depth);
+ for (int64 d = 0; d < depth; ++d) {
+ float denom = 0.0f;
+ for (int64 r = std::max(0ll, d - depth_radius);
+ r < std::min(depth, d + depth_radius + 1); ++r) {
+ denom += in(i, r) * in(i, r);
+ }
+ denom = std::pow(denom * alpha + bias, beta);
+ out_col(d) = in(i, d) / denom;
+ }
+ out.chip<0>(i) = out_col;
+ }
+ auto actual = GetOutput(0)->tensor<float, 4>();
+ Eigen::Tensor<float, 0, Eigen::RowMajor> sum =
+ ((expected - actual).abs() > actual.constant(tol_))
+ .select(actual.constant(1), actual.constant(0))
+ .sum();
+ return sum() == 0;
+ }
+
+ random::PhiloxRandom philox_;
+ random::SimplePhilox rand_;
+};
+
+TEST_F(LRNFloatTest, Depth96) {
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input(FakeInput())
+ .Attr("depth_radius", 5)
+ .Attr("bias", 1.0f)
+ .Attr("alpha", 0.1f)
+ .Attr("beta", 2.0f)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ AddInput<float>(TensorShape({1, 1, 1, 96}),
+ [this](int i) -> float { return i + 1; });
+ ASSERT_OK(RunOpKernel());
+ auto actual = GetOutput(0)->tensor<float, 4>();
+
+ // Output for Node 0 with Value 1:
+ // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2
+ EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_);
+
+ // Output for Node 5 with Value 6:
+ // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2
+ EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_);
+
+ // Output for Node 63 with value 64:
+ // 64 / (1 + 0.1*(59^2 + 60^2 + 61^2 + 62^2 + 63^2 + 64^2))^2
+ EXPECT_NEAR(64. / (2272.1 * 2272.1), actual(0, 0, 0, 63), tol_);
+
+ // Output for Node 64 with value 65:
+ // 65 / (1 + 0.1*(65^2 + 66^2 + 67^2 + 68^2 + 69^2 + 70^2))^2
+ EXPECT_NEAR(65. / (2736.5 * 2736.5), actual(0, 0, 0, 64), tol_);
+
+ // Output for Node 95 with value 96:
+ // 96 / (1 + 0.1*(91^2 + 92^2 + 93^2 + 94^2 + 95^2 + 96^2))^2
+ EXPECT_NEAR(96. / (5248.1 * 5248.1), actual(0, 0, 0, 95), tol_);
+ EXPECT_TRUE(Compare());
+}
+
+TEST_F(LRNFloatTest, Depth16) {
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input(FakeInput())
+ .Attr("depth_radius", 5)
+ .Attr("bias", 1.0f)
+ .Attr("alpha", 0.1f)
+ .Attr("beta", 2.0f)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ AddInput<float>(TensorShape({1, 1, 1, 16}),
+ [this](int i) -> float { return i + 1; });
+ ASSERT_OK(RunOpKernel());
+ auto actual = GetOutput(0)->tensor<float, 4>();
+
+ // Output for Node 0 with Value 1:
+ // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2
+ EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_);
+
+ // Output for Node 5 with Value 6:
+ // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2
+ EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_);
+
+ // Output for Node 15 with value 16:
+ // 16 / (1 + 0.1*(11^2 + 12^2 + 13^2 + 14^2 + 15^2 + 16^2))^2
+ EXPECT_NEAR(16. / (112.1 * 112.1), actual(0, 0, 0, 15), tol_);
+ EXPECT_TRUE(Compare());
+}
+
+static double RndGaussian(random::SimplePhilox* rnd) {
+ // Box-Muller transformation.
+ // See, for example, http://www.taygeta.com/random/gaussian.html
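+ // Editorial note: this is the polar (Marsaglia) form, which produces two
+ // independent Gaussian variates, x1 * w and x2 * w; only the first is
+ // returned here.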
+ double x1, x2;
+ double r;
+ do {
+ x1 = 2 * rnd->RandDouble() - 1;
+ x2 = 2 * rnd->RandDouble() - 1;
+ r = x1 * x1 + x2 * x2;
+ } while (r == 0 || r >= 1.0);
+ double w = sqrt(-2.0 * log(r) / r);
+ return x1 * w;
+}
+
+#define TCASE(NAME, DEPTH, BATCH, DEPTH_RADIUS, BIAS, ALPHA, BETA) \
+ TEST_F(LRNFloatTest, NAME) { \
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN") \
+ .Input(FakeInput()) \
+ .Attr("depth_radius", (DEPTH_RADIUS)) \
+ .Attr("bias", (BIAS)) \
+ .Attr("alpha", ((ALPHA) / 10)) \
+ .Attr("beta", (BETA)) \
+ .Finalize(node_def())); \
+ ASSERT_OK(InitOp()); \
+ AddInput<float>(TensorShape({BATCH, 1, 1, DEPTH}), \
+ [this](int i) -> float { return RndGaussian(&rand_); }); \
+ ASSERT_OK(RunOpKernel()); \
+ EXPECT_TRUE(Compare()); \
+ }
+
+// clang-format off
+// DEPTH BATCH DEPTH_RADIUS BIAS ALPHA BETA
+TCASE(T0, 4, 2, 2, 1.0f, 1.0f, 2.0f)
+TCASE(T1, 16, 1, 5, 1.0f, 1.0f, 2.0f)
+TCASE(T2, 16, 32, 2, 1.0f, 2.0f, 1.0f)
+TCASE(T3, 128, 4, 3, 2.0f, 1.0f, 1.0f)
+// clang-format on
+
+#undef TCASE
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
new file mode 100644
index 0000000000..08a4da5b41
--- /dev/null
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -0,0 +1,42 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/match.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class MatchingFilesOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+ void Compute(OpKernelContext* context) override {
+ const Tensor* pattern;
+ OP_REQUIRES_OK(context, context->input("pattern", &pattern));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(pattern->shape()),
+ errors::InvalidArgument(
+ "Input pattern tensor must be scalar, but had shape: ",
+ pattern->shape().DebugString()));
+ std::vector<string> fnames;
+ OP_REQUIRES_OK(context,
+ io::GetMatchingFiles(context->env(),
+ pattern->scalar<string>()(), &fnames));
+ const int num_out = fnames.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ "filenames", TensorShape({num_out}), &output));
+ auto output_vec = output->vec<string>();
+ for (int i = 0; i < num_out; ++i) {
+ output_vec(i) = fnames[i];
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MatchingFiles").Device(DEVICE_CPU),
+ MatchingFilesOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
new file mode 100644
index 0000000000..48bdba78b2
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -0,0 +1,214 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/matmul_op.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+#endif // GOOGLE_CUDA
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, bool USE_CUBLAS>
+struct LaunchMatMul;
+
+// On CPUs, we ignore USE_CUBLAS
+template <typename T>
+struct LaunchMatMulCPU {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(),
+ out->matrix<T>(), a.matrix<T>(),
+ b.matrix<T>(), dim_pair);
+ }
+};
+
+template <typename T, bool USE_CUBLAS>
+struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {};
+
+#if GOOGLE_CUDA
+
+template <typename T>
+struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ perftools::gputools::blas::Transpose trans[] = {
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose};
+ const uint64 m = a.dim_size(1 - dim_pair[0].first);
+ const uint64 k = a.dim_size(dim_pair[0].first);
+ const uint64 n = b.dim_size(1 - dim_pair[0].second);
+ bool transpose_a = dim_pair[0].first == 0;
+ bool transpose_b = dim_pair[0].second == 1;
+ auto blas_transpose_a = trans[transpose_a];
+ auto blas_transpose_b = trans[transpose_b];
+
+ auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+ auto a_ptr = AsDeviceMemory(a.template flat<T>().data());
+ auto b_ptr = AsDeviceMemory(b.template flat<T>().data());
+ auto c_ptr = AsDeviceMemory(out->template flat<T>().data());
+
+ // Cublas does
+ // C = A x B
+ // where A, B and C are assumed to be in column major.
+ // We want the output to be in row-major, so we can compute
+ // C' = B' x A' (' stands for transpose)
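+ // For example (illustrative): with an untransposed a of shape [m, k] and b
+ // of shape [k, n], cuBLAS is asked for the n x m column-major product of
+ // b' (n x k) and a' (k x m), which is exactly the m x n row-major result we
+ // want in c_ptr.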
+ bool blas_launch_status =
+ stream->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
+ b_ptr, transpose_b ? k : n, a_ptr,
+ transpose_a ? m : k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ ctx->SetStatus(errors::Internal(
+ "Blas SGEMM launch failed : a.shape=(", a.dim_size(0), ", ",
+ a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
+ "), m=", m, ", n=", n, ", k=", k));
+ }
+ }
+};
+
+template <typename T>
+struct LaunchMatMul<GPUDevice, T, false /* USE_CUBLAS */> {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ functor::MatMulFunctor<GPUDevice, T>()(ctx->eigen_device<GPUDevice>(),
+ out->matrix<T>(), a.matrix<T>(),
+ b.matrix<T>(), dim_pair);
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename T, bool USE_CUBLAS>
+class MatMulOp : public OpKernel {
+ public:
+ explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& a = ctx->input(0);
+ const Tensor& b = ctx->input(1);
+
+ // Check that the dimensions of the two matrices are valid.
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("In[0] is not a matrix"));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("In[1] is not a matrix"));
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
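+ // For example (illustrative): with transpose_a = transpose_b = false this
+ // contracts a's dim 1 with b's dim 0, i.e. [m, k] x [k, n] -> [m, n];
+ // with transpose_a = true, a's dim 0 is contracted instead, so a is
+ // interpreted as [k, m].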
+
+ OP_REQUIRES(ctx,
+ a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+ errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
+ a.shape().DebugString(), ", In[1]: ",
+ b.shape().DebugString()));
+ int a_dim_remaining = 1 - dim_pair[0].first;
+ int b_dim_remaining = 1 - dim_pair[0].second;
+ TensorShape out_shape(
+ {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+ if (out->NumElements() == 0) {
+ // If a has shape [0, x] or b has shape [x, 0], the output shape
+ // is a 0-element matrix, so there is nothing to do.
+ return;
+ }
+
+ if (a.NumElements() == 0 || b.NumElements() == 0) {
+ // If a has shape [x, 0] and b has shape [0, y], the
+ // output shape is [x, y] where x and y are non-zero, so we fill
+ // the output with zeros.
+ functor::SetZeroFunctor<Device, T> f;
+ f(ctx->eigen_device<Device>(), out->flat<T>());
+ return;
+ }
+
+ LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out);
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+};
+
+namespace functor {
+
+// Partial specialization MatMulFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct MatMulFunctor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ MatMul<CPUDevice>(d, out, in0, in1, dim_pair);
+ }
+};
+
+} // end namespace functor
+
+#define REGISTER_CPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \
+ MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+
+#define REGISTER_GPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ MatMulOp<GPUDevice, T, true /* cublas, true by default */>); \
+ REGISTER_KERNEL_BUILDER(Name("MatMul") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .Label("cublas"), \
+ MatMulOp<GPUDevice, T, true /* cublas */>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T").Label("eigen"), \
+ MatMulOp<GPUDevice, T, false /* cublas */>)
+
+REGISTER_CPU(float);
+REGISTER_CPU(double);
+REGISTER_CPU(int32);
+REGISTER_CPU(complex64);
+#if GOOGLE_CUDA
+REGISTER_GPU(float);
+// REGISTER_GPU(double);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
new file mode 100644
index 0000000000..f75b0ded1b
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -0,0 +1,40 @@
+#ifndef TENSORFLOW_KERNELS_MATMUL_OP_H_
+#define TENSORFLOW_KERNELS_MATMUL_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Helpers to define tensor<T> needed by MatMul op.
+template <typename T>
+struct MatMulTypes {
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
+ out_type;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
+ Eigen::Aligned> in_type;
+};
+
+template <typename Device, typename In0, typename In1, typename Out,
+ typename DimPair>
+void MatMul(const Device& d, Out out, In0 in0, In1 in1,
+ const DimPair& dim_pair) {
+ out.device(d) = in0.contract(in1, dim_pair);
+}
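+
+// Illustrative sketch (editorial, not part of the original header): invoking
+// the helper for a plain row-major product, with `device`, `out`, `in0` and
+// `in1` assumed to be in scope with the types above:
+//
+//   Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims;
+//   dims[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+//   MatMul<Eigen::ThreadPoolDevice>(device, out, in0, in1, dims);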
+
+template <typename Device, typename T>
+struct MatMulFunctor {
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(
+ const Device& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/matmul_op_gpu.cu.cc b/tensorflow/core/kernels/matmul_op_gpu.cu.cc
new file mode 100644
index 0000000000..17107ce5df
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op_gpu.cu.cc
@@ -0,0 +1,32 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/matmul_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization MatMulFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct MatMulFunctor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ MatMul<GPUDevice>(d, To32Bit(out), To32Bit(in0), To32Bit(in1), dim_pair);
+ }
+};
+
+#define DEFINE(T) template struct MatMulFunctor<GPUDevice, T>;
+DEFINE(float);
+// DEFINE(double); // Does not compile 1/2015.
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc
new file mode 100644
index 0000000000..b2b8f3d905
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op_test.cc
@@ -0,0 +1,56 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+static Graph* Matmul(int m, int k, int n, bool transpose_a, bool transpose_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor in0(DT_FLOAT, transpose_a ? TensorShape({k, m}) : TensorShape({m, k}));
+ in0.flat<float>().setRandom();
+ Tensor in1(DT_FLOAT, transpose_b ? TensorShape({n, k}) : TensorShape({k, n}));
+ in1.flat<float>().setRandom();
+ test::graph::Matmul(g, test::graph::Constant(g, in0),
+ test::graph::Constant(g, in1), transpose_a, transpose_b);
+ return g;
+}
+
+#define BM_MatmulDev(M, K, N, TA, TB, DEVICE) \
+ static void BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
+ int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ test::Benchmark(#DEVICE, Matmul(M, K, N, TA, TB)).Run(iters); \
+ } \
+ BENCHMARK(BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
+
+#define BM_Matmul(M, K, N, TA, TB) \
+ BM_MatmulDev(M, K, N, TA, TB, cpu); \
+ BM_MatmulDev(M, K, N, TA, TB, gpu);
+
+// Typical fully connected layers
+BM_Matmul(8, 512, 512, false, false);
+BM_Matmul(16, 512, 512, false, false);
+BM_Matmul(128, 512, 512, false, false);
+
+BM_Matmul(8, 1024, 1024, false, false);
+BM_Matmul(16, 1024, 1024, false, false);
+BM_Matmul(128, 1024, 1024, false, false);
+BM_Matmul(4096, 4096, 4096, false, false);
+
+// Backward for fully connected layers
+BM_Matmul(8, 1024, 1024, false, true);
+BM_Matmul(16, 1024, 1024, false, true);
+BM_Matmul(128, 1024, 1024, false, true);
+
+// Forward softmax with large output size
+BM_Matmul(8, 200, 10000, false, false);
+BM_Matmul(20, 200, 10000, false, false);
+BM_Matmul(20, 200, 20000, false, false);
+
+// Backward softmax with large output size
+BM_Matmul(8, 10000, 200, false, true);
+BM_Matmul(20, 10000, 200, false, true);
+BM_Matmul(20, 20000, 200, false, true);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
new file mode 100644
index 0000000000..ad0948d6ef
--- /dev/null
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -0,0 +1,64 @@
+// See docs in ../ops/linalg_ops.cc.
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/LU"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class MatrixInverseOp
+ : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit MatrixInverseOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+ ~MatrixInverseOp() override {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return input_matrix_shape;
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+ // A big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ if (input.rows() == 0) {
+ // By definition, an empty matrix's inverse is an empty matrix.
+ return;
+ }
+ Eigen::FullPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> lu_decomposition(input);
+ OP_REQUIRES(context, lu_decomposition.isInvertible(),
+ errors::InvalidArgument("Input is not invertible."));
+ *output = lu_decomposition.inverse();
+ }
+};
+
+REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float, false>), float);
+REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<double, false>), double);
+REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<float, true>), float);
+REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<double, true>),
+ double);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
new file mode 100644
index 0000000000..31046018c5
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -0,0 +1,554 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/maxpooling_op.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+const int kInvalidMaxPoolingIndex = -1;
+
+template <typename Device, typename T>
+struct SpatialMaxPoolWithArgMaxHelper {
+ static void Compute(Tensor* output, Tensor* output_arg_max,
+ const Tensor& tensor_in, const PoolParameters& params,
+ const Padding& padding) {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenIndexMatrixMap;
+
+ ConstEigenMatrixMap in_mat(
+ tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ output->flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ EigenIndexMatrixMap out_arg_max_mat(
+ output_arg_max->flat<int64>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ // Initializes the output tensor with the lowest value of T and the argmax
+ // tensor with kInvalidMaxPoolingIndex.
+ output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex);
+ output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // max value.
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
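+ // Example (illustrative): with window_rows = 3, row_stride = 2 and
+ // pad_rows = 1, input row h = 4 gives hpad = 5, so h_start =
+ // (5 - 3) / 2 + 1 = 2 and h_end = min(5 / 2 + 1, out_height) = 3
+ // (assuming out_height >= 3); the row feeds only output row 2.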
+ // compute elementwise max
+ const int in_index =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_index =
+ (b * params.out_height + ph) * params.out_width + pw;
+ /// NOTE(zhengxq): not using the Eigen matrix operation for now.
+ /// May consider parallelizing the operations if needed.
+ for (int d = 0; d < params.depth; ++d) {
+ const T& input_ref = in_mat.coeffRef(d, in_index);
+ T& output_ref = out_mat.coeffRef(d, out_index);
+ int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
+ if (output_ref < input_ref ||
+ out_arg_max_ref == kInvalidMaxPoolingIndex) {
+ output_ref = input_ref;
+ int input_offset = in_index * params.depth + d;
+ out_arg_max_ref = input_offset;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
+ MaxPoolingOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+// Forward declarations for the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
+ const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, int window_rows, \
+ int window_cols, int row_stride, int col_stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Note(jiayq): Currently, the Caffe custom implementation is faster than the
+// default Eigen implementation so we are using the custom kernel as the
+// default. However, you can explicitly invoke the eigen version using
+// kernel_label_map.
+REGISTER_KERNEL_BUILDER(Name("MaxPool")
+ .Device(DEVICE_GPU)
+ .Label("eigen_tensor"),
+ MaxPoolingOp<Eigen::GpuDevice, float>);
+#endif // GOOGLE_CUDA
+
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MaxPoolingGradOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ OP_REQUIRES(
+ context, ksize_[3] == 1 && stride_[3] == 1,
+ errors::Unimplemented(
+ "MaxPoolingGrad is not yet supported on the depth dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape = tensor_in.shape();
+
+ Tensor tensor_out_dup;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ tensor_out.shape(), &tensor_out_dup));
+ Tensor tensor_out_arg_max;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
+ tensor_out.shape(),
+ &tensor_out_arg_max));
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ output->flat<T>().setZero();
+
+ SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute(
+ &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_);
+ auto out_backprop_flat = out_backprop.flat<T>();
+ auto input_backprop_flat = output->flat<T>();
+ auto out_arg_max_flat = tensor_out_arg_max.flat<int64>();
+ int num_total_outputs = out_backprop.flat<T>().size();
+ int num_total_inputs = input_backprop_flat.size();
+
+ for (int index = 0; index < num_total_outputs; ++index) {
+ int input_backprop_index = out_arg_max_flat(index);
+ // Although this check is in the inner loop, it is worth the cost: it
+ // guards against memory corruption from bad indices, and our benchmarks
+ // show the performance impact is quite small.
+ CHECK(input_backprop_index >= 0 &&
+ input_backprop_index < num_total_inputs)
+ << "Invalid input backprop index: " << input_backprop_index << ", "
+ << num_total_inputs;
+ input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
+ MaxPoolingGradOp<CPUDevice, float>);
+
+#ifdef GOOGLE_CUDA
+
+static void MaxPoolingBackwardCustomKernel(
+ OpKernelContext* context, const std::vector<int32>& size,
+ const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
+ const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+ Tensor* output = nullptr;
+
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, tensor_in_shape, &output));
+
+ PoolParameters params{context, size, stride, padding, tensor_in_shape};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ MaxPoolBackwardNoMask(
+ tensor_in->flat<float>().data(), params.tensor_in_batch,
+ params.tensor_in_rows, params.tensor_in_cols, params.depth,
+ params.out_height, params.out_width, params.window_rows,
+ params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
+ params.pad_cols, out_backprop.flat<float>().data(),
+ output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
+}
+
+template <class T>
+class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
+ public:
+ typedef Eigen::GpuDevice Device;
+
+ explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+
+ use_dnn_ = CanUseCudnn();
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+                errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape = tensor_in.shape();
+
+ if (use_dnn_) {
+ DnnPoolingGradOp<T>::Compute(
+ context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
+ stride_, padding_, &tensor_in, &tensor_out, out_backprop,
+ output_shape);
+ } else {
+ MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
+ &tensor_in, out_backprop, output_shape);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ bool use_dnn_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
+ MaxPoolingGradOp<Eigen::GpuDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingNoMask;
+
+template <typename Device, typename T>
+class MaxPoolingNoMaskOp : public OpKernel {
+ public:
+ explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
+ output);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingWithArgmax;
+
+template <typename Device, typename T>
+class MaxPoolingWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+ Tensor* argmax = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
+
+ LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in,
+ output, argmax);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingGradWithArgmax;
+
+template <typename Device, typename T>
+class MaxPoolingGradWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& grad_in = context->input(1);
+ const Tensor& argmax = context->input(2);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth});
+ Tensor* grad_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out));
+
+ LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
+ argmax, grad_out);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+#if GOOGLE_CUDA
+
+template <typename T>
+struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& input, Tensor* output) {
+ bool status = MaxPoolForwardWithOptionalArgmax(
+ input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth, params.out_height,
+ params.out_width, params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
+ output->flat<T>().data(), nullptr, context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolForwardNoMask"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
+ MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
+
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& input, Tensor* output, Tensor* argmax) {
+ bool status = MaxPoolForwardWithOptionalArgmax(
+ input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth, params.out_height,
+ params.out_width, params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
+ output->flat<T>().data(),
+ reinterpret_cast<int64*>(argmax->flat<int64>().data()),
+ context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int64>("Targmax"),
+ MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
+
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& grad_in, const Tensor& argmax,
+ Tensor* grad_out) {
+ const int input_size = params.tensor_in_batch * params.tensor_in_rows *
+ params.tensor_in_cols * params.depth;
+ const int output_size = params.tensor_in_batch * params.out_height *
+ params.out_width * params.depth;
+ const int top_offset = params.out_height * params.out_width * params.depth;
+ const int bottom_offset =
+ params.tensor_in_rows * params.tensor_in_cols * params.depth;
+ bool status = MaxPoolBackwardWithArgmax(
+ output_size, input_size, grad_in.flat<T>().data(),
+ reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
+ bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int64>("Targmax"),
+ MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.h b/tensorflow/core/kernels/maxpooling_op.h
new file mode 100644
index 0000000000..a074174118
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op.h
@@ -0,0 +1,29 @@
+#ifndef TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+#define TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+// Functor definition for MaxPoolingOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct SpatialMaxPooling {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input, int window_rows,
+ int window_cols, int row_stride, int col_stride,
+ const Eigen::PaddingType& padding) {
+ // Because we swap the layout, we swap the row/cols as well
+ output.swap_layout().device(d) =
+ Eigen::SpatialMaxPooling(input.swap_layout(), window_cols, window_rows,
+ col_stride, row_stride, padding);
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
new file mode 100644
index 0000000000..65262eb54e
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -0,0 +1,261 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+
+namespace tensorflow {
+namespace {
+// This is Yangqing's custom kernel for the maxpooling operation. There are
+// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC implement the
+// forward pass for the two storage orders, and MaxPoolBackward implements the
+// backward pass for both storage orders.
+// The parameters to the forward kernels are as follows:
+// nthreads: the number of threads, which is equal to the output size.
+// bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
+// height, width, pooled_height, pooled_width: the input and output sizes.
+// kernel_h, kernel_w: the kernel sizes.
+// stride_h, stride_w: the strides.
+// pad_t, pad_l: the padding values on the top and left side.
+// top_data: the maxpool output.
+// mask: the output mask of the same size as top_data. It is stored in
+// int form, keeping track of the flattened index of the input item that
+// produces the max output. If a nullptr is passed in for mask, no mask
+// will be produced.
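+//
+// For example (an illustration added here, not part of the original comment):
+// in the NHWC kernel below, the stored mask value is the within-image offset
+// (h * width + w) * channels + c of the winning input element; the backward
+// pass then adds a per-image offset of height * width * channels to recover
+// the absolute index.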
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+ i < (n); i += blockDim.x * gridDim.x)
+
+// To call the forward and backward functions, use e.g.:
+// const int kThreadsPerBlock = 1024;
+// const int output_size = batch * channels * pooled_height * pooled_width;
+// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+// kThreadsPerBlock, 0, cuda_stream>>>(...);
+template <typename dtype>
+__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
+ const int channels, const int height,
+ const int width, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* top_data,
+ int64* mask) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * channels * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = c * height * width + h * width + w;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+ top_data[index] = maxval;
+ if (mask != nullptr) {
+ mask[index] = maxidx;
+ }
+ }
+}
+
+template <typename dtype>
+__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* top_data,
+ int64* mask) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ n /= pooled_height;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = (h * width + w) * channels + c;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+ top_data[index] = maxval;
+ if (mask != nullptr) {
+ mask[index] = maxidx;
+ }
+ }
+}
+
+template <typename dtype>
+__global__ void MaxPoolBackwardNoMaskNHWC(
+ const int nthreads, const dtype* bottom_data, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ const dtype* top_diff, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ n /= pooled_height;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = (h * width + w) * channels + c;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+
+    // Atomically accumulate the bottom diff. maxidx can still be -1
+    // (uninitialized) if all of the bottom_data values are NaN.
+ if (maxidx != -1) {
+ atomicAdd(bottom_diff + n * height * width * channels + maxidx,
+ top_diff[index]);
+ }
+ }
+}
+
+// The parameters to the kernels in the backward function are as follows:
+// nthreads: the number of threads, which is equal to the output size.
+// top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
+// N*C*Hout*Wout). As we have stored the flattened index of the input
+// entries, the backward function is agnostic of the input storage order.
+// mask: the output mask of the same size as top_data. It is stored in
+// int form, keeping track of the flattened index of the input item that
+// produces the max output.
+// top_offset: the pre-computed per-image offset of the maxpool output. This
+// is equal to Hout*Wout*C. We choose to pre-compute this so we do not
+// need to compute it every time inside the kernel.
+// bottom_offset: the pre-computed per-image offset of the maxpool input.
+// This is equal to H*W*C.
+// bottom_diff: the gradient with respect to the input.
+// This function relies on atomicAdd to avoid race conditions. Also, before the
+// kernel is run, you will need to make sure that bottom_diff is filled with
+// zero first.
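+//
+// For example (illustrative numbers, not from the original comment): pooling a
+// 4x4 input with 3 channels down to a 2x2 output gives
+// top_offset = 2 * 2 * 3 = 12 and bottom_offset = 4 * 4 * 3 = 48.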
+template <typename dtype>
+__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
+ const int64* mask, const int top_offset,
+ const int bottom_offset, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int image_id = (index / top_offset);
+ atomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
+ top_diff[index]);
+ }
+}
+
+template <typename dtype>
+__global__ void SetZero(const int nthreads, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = dtype(0); }
+}
+
+#undef CUDA_1D_KERNEL_LOOP
+} // namespace
+
+bool MaxPoolForwardWithOptionalArgmax(
+ const float* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* top_data, int64* mask, const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ const int output_size = batch * channels * pooled_height * pooled_width;
+
+ MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ output_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_data, mask);
+ return d.ok();
+}
+
+bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l,
+ const float* top_diff, float* bottom_diff,
+ const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ const int bottom_size = batch * channels * height * width;
+ const int top_size = batch * channels * pooled_height * pooled_width;
+
+ SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
+
+ MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
+ kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ top_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_diff, bottom_diff);
+ return d.ok();
+}
+
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+ const float* top_diff, const int64* mask,
+ const int top_offset, const int bottom_offset,
+ float* bottom_diff, const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+ MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+ return d.ok();
+}
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::SpatialMaxPooling<GPUDevice, T>;
+
+DEFINE_GPU_KERNELS(float)
+
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
new file mode 100644
index 0000000000..bfdac904cc
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -0,0 +1,42 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+
+// Run the forward pass of max pooling, optionally writing the argmax indices to
+// the mask array, if it is not nullptr. If mask is passed in as nullptr, the
+// argmax indices are not written.
+bool MaxPoolForwardWithOptionalArgmax(
+ const float* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* top_data, int64* mask, const Eigen::GpuDevice& d);
+
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+ const float* top_diff, const int64* mask,
+ const int top_offset, const int bottom_offset,
+ float* bottom_diff, const Eigen::GpuDevice& d);
+
+bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l,
+ const float* top_diff, float* bottom_diff,
+ const Eigen::GpuDevice& d);
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc
new file mode 100644
index 0000000000..b4f9df81a6
--- /dev/null
+++ b/tensorflow/core/kernels/no_op.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/no_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp);
+REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/no_op.h b/tensorflow/core/kernels/no_op.h
new file mode 100644
index 0000000000..a3bcbd7680
--- /dev/null
+++ b/tensorflow/core/kernels/no_op.h
@@ -0,0 +1,17 @@
+#ifndef TENSORFLOW_KERNELS_NO_OP_H_
+#define TENSORFLOW_KERNELS_NO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class NoOp : public OpKernel {
+ public:
+ explicit NoOp(OpKernelConstruction* context) : OpKernel(context) {}
+ void Compute(OpKernelContext* context) override {}
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_NO_OP_H_
diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc
new file mode 100644
index 0000000000..7bea17b9e2
--- /dev/null
+++ b/tensorflow/core/kernels/ops_testutil.cc
@@ -0,0 +1,18 @@
+#include "tensorflow/core/kernels/ops_testutil.h"
+
+namespace tensorflow {
+namespace test {
+
+NodeDef Node(const string& name, const string& op,
+ const std::vector<string>& inputs) {
+ NodeDef def;
+ def.set_name(name);
+ def.set_op(op);
+ for (const string& s : inputs) {
+ def.add_input(s);
+ }
+ return def;
+}
+
+} // namespace test
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
new file mode 100644
index 0000000000..7a3405bf04
--- /dev/null
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -0,0 +1,191 @@
+#ifndef TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+#define TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+namespace test {
+
+// Return a NodeDef with the specified name/op/inputs.
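+// For example (illustrative): test::Node("n", "NoOp", {}) builds a NodeDef
+// named "n" that runs the "NoOp" kernel registered in no_op.cc.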
+NodeDef Node(const string& name, const string& op,
+ const std::vector<string>& inputs);
+
+} // namespace test
+
+// A helper class for testing op kernels.
+//
+// This class will eventually be replaced / heavily modified
+// to use the BrainClient interface.
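+//
+// A minimal usage sketch (added for illustration; "MyOpTest" is a placeholder
+// name, and "NoOp" is the kernel registered in no_op.cc):
+//
+//   class MyOpTest : public OpsTestBase {};
+//
+//   TEST_F(MyOpTest, RunsKernel) {
+//     set_node_def(test::Node("n", "NoOp", {}));
+//     ASSERT_TRUE(InitOp().ok());
+//     ASSERT_TRUE(RunOpKernel().ok());
+//   }
+//
+// Ops that consume inputs or produce outputs would additionally call
+// AddInputFromArray() before RunOpKernel() and GetOutput() afterwards.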
+class OpsTestBase : public ::testing::Test {
+ public:
+ OpsTestBase() : device_type_(DEVICE_CPU) {
+ device_.reset(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+ CHECK(device_.get()) << "Could not create CPU device";
+ }
+
+ ~OpsTestBase() override {
+ gtl::STLDeleteElements(&tensors_);
+ context_.reset(nullptr);
+ }
+
+ void set_node_def(const NodeDef& node_def) { node_def_.CopyFrom(node_def); }
+
+ // Clients can manipulate the underlying NodeDef via this accessor.
+ NodeDef* node_def() { return &node_def_; }
+
+  // Creates the kernel for the operator described by node_def(), recording
+  // the kernel's expected input types.
+ //
+ // Returns the status of initialization.
+ Status InitOp() {
+ Status status;
+ kernel_ = CreateOpKernel(device_type_, device_.get(), allocator(),
+ node_def_, &status);
+ if (kernel_ != nullptr) input_types_ = kernel_->input_types();
+ return status;
+ }
+
+  // Adds a single input tensor with the given 'shape'. 'input_mapping' maps
+  // each flat index in [0, NumElements(shape)) to the value stored at that
+  // position.
+ //
+ // TODO(vrv): Replace with something like a BrainClient Feed.
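+  //
+  // For example (illustrative only):
+  //   AddInput<float>(TensorShape({2, 2}),
+  //                   [](int i) -> float { return i * i; });
+  // fills a 2x2 float input with {0, 1, 4, 9}.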
+ template <typename T>
+ void AddInput(const TensorShape& shape, std::function<T(int)> input_mapping) {
+ CHECK_GT(input_types_.size(), inputs_.size())
+ << "Adding more inputs than types; perhaps you need to call MakeOp";
+ bool is_ref = IsRefType(input_types_[inputs_.size()]);
+ Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+ DataTypeToEnum<T>::v(), shape);
+ test::FillFn(input, input_mapping);
+ tensors_.push_back(input);
+ if (is_ref) {
+ CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
+ DataTypeToEnum<T>::v());
+ inputs_.push_back({&lock_for_refs_, input});
+ } else {
+ CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v());
+ inputs_.push_back({nullptr, input});
+ }
+ }
+
+  // Like AddInput, but takes an explicit gtl::ArraySlice of data.
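+  //
+  // For example (illustrative only):
+  //   AddInputFromArray<float>(TensorShape({2, 2}), {1.0f, 2.0f, 3.0f, 4.0f});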
+ template <typename T>
+ void AddInputFromArray(const TensorShape& shape,
+ const gtl::ArraySlice<T>& data) {
+ CHECK_GT(input_types_.size(), inputs_.size())
+ << "Adding more inputs than types; perhaps you need to call MakeOp";
+ bool is_ref = IsRefType(input_types_[inputs_.size()]);
+ Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+ DataTypeToEnum<T>::v(), shape);
+ test::FillValues<T>(input, data);
+ tensors_.push_back(input);
+ if (is_ref) {
+ CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
+ DataTypeToEnum<T>::v());
+ inputs_.push_back({&lock_for_refs_, input});
+ } else {
+ CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v());
+ inputs_.push_back({nullptr, input});
+ }
+ }
+
+  // Runs the operation using the inputs added so far.
+ //
+ // Returns the context's status after running the operation.
+ Status RunOpKernel() {
+ OpKernelContext::Params params;
+ params.device = device_.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs_;
+ params.op_kernel = kernel_.get();
+ params.output_alloc_attr = [this, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host =
+ (kernel_->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ context_.reset(new OpKernelContext(params));
+ device_->Compute(kernel_.get(), context_.get());
+ return context_->status();
+ }
+
+ // Returns the tensor input for 'input_index'.
+ //
+ // REQUIRES: 0 <= input_index < context_->num_inputs()
+ const Tensor& GetInput(int input_index) const {
+ CHECK_LT(input_index, context_->num_inputs());
+ CHECK(!IsRefType(context_->input_dtype(input_index)));
+ return context_->input(input_index);
+ }
+
+ TensorValue mutable_input(int input_index) {
+ CHECK_LT(input_index, inputs_.size());
+ return inputs_[input_index];
+ }
+ // Returns the tensor output for 'output_index'.
+ //
+ // REQUIRES: 0 <= output_index < context_->num_outputs()
+ Tensor* GetOutput(int output_index) {
+ CHECK_LT(output_index, context_->num_outputs());
+ return context_->mutable_output(output_index);
+ }
+
+ Allocator* allocator() {
+ return device_->GetAllocator(AllocatorAttributes());
+ }
+
+ const DataTypeVector& output_types() const { return kernel_->output_types(); }
+
+ protected:
+ std::unique_ptr<Device> device_;
+
+ std::unique_ptr<OpKernel> kernel_;
+ NodeDef node_def_;
+ DataTypeVector input_types_;
+ DeviceType device_type_;
+
+ mutex lock_for_refs_; // Used as the Mutex for inputs added as refs
+
+ gtl::InlinedVector<TensorValue, 4> inputs_;
+ // Owns Tensors.
+ std::vector<Tensor*> tensors_;
+
+ std::unique_ptr<OpKernelContext> context_;
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/kernels/ops_util.cc
new file mode 100644
index 0000000000..ca2925128e
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util.cc
@@ -0,0 +1,113 @@
+#include <cmath>
+
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+void RequireDefaultOps() {
+// TODO(opensource): Use a more generic sounding preprocessor name than
+// GOOGLE_CUDA (maybe SUPPORT_CUDA?)
+#if GOOGLE_CUDA
+ void RequireGPUDevice();
+ RequireGPUDevice();
+#endif
+}
+
+Status Get2dOutputSize(const int in_height, const int in_width,
+ int filter_height, int filter_width, int row_stride,
+ int col_stride, Padding padding, int* new_height,
+ int* new_width, int* pad_rows, int* pad_cols) {
+ int pad_bottom_unused, pad_right_unused;
+ return Get2dOutputSizeVerbose(
+ in_height, in_width, filter_height, filter_width, row_stride, col_stride,
+ padding, new_height, new_width, pad_rows, &pad_bottom_unused, pad_cols,
+ &pad_right_unused);
+}
+
+Status Get2dOutputSizeVerbose(const int in_height, const int in_width,
+ int filter_height, int filter_width,
+ int row_stride, int col_stride, Padding padding,
+ int* new_height, int* new_width, int* pad_top,
+ int* pad_bottom, int* pad_left, int* pad_right) {
+ // Cannot have strides larger than the patch size.
+ if (row_stride > filter_height || col_stride > filter_width) {
+ return errors::InvalidArgument(
+ "stride must be less than or equal to kernel size");
+ }
+ switch (padding) {
+ case Padding::VALID:
+ *new_height = ceil((in_height - filter_height + 1.f) /
+ static_cast<float>(row_stride));
+ *new_width = ceil((in_width - filter_width + 1.f) /
+ static_cast<float>(col_stride));
+ *pad_top = 0;
+ *pad_bottom = 0;
+ *pad_left = 0;
+ *pad_right = 0;
+ break;
+ case Padding::SAME:
+ *new_height = ceil(in_height / static_cast<float>(row_stride));
+ *new_width = ceil(in_width / static_cast<float>(col_stride));
+ // Calculate padding for top/bottom/left/right, spilling any excess
+ // padding to bottom and right.
+ const int pad_needed_height =
+ (*new_height - 1) * row_stride + filter_height - in_height;
+ *pad_top = pad_needed_height / 2;
+ CHECK_GE(pad_needed_height, 0);
+ *pad_bottom = pad_needed_height - *pad_top;
+
+ const int pad_needed_width =
+ (*new_width - 1) * col_stride + filter_width - in_width;
+ *pad_left = pad_needed_width / 2;
+ CHECK_GE(pad_needed_width, 0);
+ *pad_right = pad_needed_width - *pad_left;
+ break;
+ }
+ if (*new_height < 0 || *new_width < 0) {
+ return errors::InvalidArgument("computed output size would be negative");
+ }
+ return Status::OK();
+}
+
+Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) {
+ switch (padding) {
+ case Padding::VALID:
+ return Eigen::PADDING_VALID;
+ case Padding::SAME:
+ return Eigen::PADDING_SAME;
+ }
+ return Eigen::PADDING_SAME; // Prevent compiler warning about missing return
+}
+
+Status GetBroadcastSize(const int index, const int in_size,
+ const int ksize, const int stride,
+ const int pad_size, int* bindex, int* bsize) {
+ // Cannot have strides larger than the patch size.
+ if (stride > ksize) {
+ return errors::InvalidArgument(
+ "stride must be less than or equal to kernel size");
+ }
+ // Cannot have index beyond the input size.
+ if (index * stride > in_size) {
+ return errors::InvalidArgument(
+ "index * stride must be less than or equal to input size");
+ }
+ *bindex = index * stride;
+ *bsize = ksize;
+ if (*bindex < pad_size) {
+ // If the current index is in the padding area, start broadcast from index
+ // 0 with broadcast size reduced by padding size.
+ *bsize = ksize + *bindex - pad_size;
+ *bindex = 0;
+ } else {
+ // Otherwise, start broadcast from current index reduced by padding size.
+ *bindex -= pad_size;
+ }
+ if (*bindex + ksize > in_size) {
+ *bsize = std::min((in_size - *bindex), ksize);
+ }
+ return Status::OK();
+}
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
new file mode 100644
index 0000000000..283338f8df
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util.h
@@ -0,0 +1,180 @@
+#ifndef TENSORFLOW_KERNELS_OPS_UTIL_H_
+#define TENSORFLOW_KERNELS_OPS_UTIL_H_
+
+// This file contains utilities for various operations.
+
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// Call this function from a test if op kernels are not being
+// registered. This can happen if the test is linked in a shared
+// mode and has no direct references to any code from this directory.
+void RequireDefaultOps();
+
+// Get2dOutputSize(): Given an input tensor, kernel, stride and padding
+// type, the function computes the output and padding dimensions.
+//
+// Convolution layers take in an input tensor of shape (D, C, R, B), and
+// convolve it with a set of filters, which can also be presented as a
+// tensor (D, K, K, M), where M is the number of filters, K is the filter size,
+// and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images). It also takes in a few additional parameters:
+//
+// Stride (S): the stride with which we apply the filters. This is the offset
+// between locations where we apply the filters. A larger stride
+// means that the output will be spatially smaller.
+//
+// Padding (P): the padding we apply to the input tensor along the R and C
+// dimensions. This is usually used to make sure that the spatial dimensions
+// do not shrink when we progress with convolutions. Two types of padding are
+// often used:
+//   SAME: the pad value is computed so that the output will have size
+//         ceil(R/S) and ceil(C/S).
+//   VALID: no padding is carried out.
+// The padded area is zero-filled.
+//
+// The output dimensions for convolution and many other operations, when given
+// all the parameters above, are as follows:
+// - When Padding = SAME: the output size is (B, R', C', M), where
+// R' = ceil(float(R) / float(S))
+// C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The number of padded rows and columns
+// are computed as:
+// Pr = ((R' - 1) * S + K - R) / 2
+// Pc = ((C' - 1) * S + K - C) / 2
+// When the stride is 1, we have the simplified case
+// R'=R, C'=C, Pr=Pc=(K-1)/2.
+//   This is where SAME comes from - the output has the same size as the
+//   input.
+//
+// - When Padding = VALID: the output size is computed as
+// R' = ceil(float(R - K + 1) / float(S))
+// C' = ceil(float(C - K + 1) / float(S))
+//   and no padding is applied (Pr = Pc = 0).
+// When the stride is 1, we have the simplified case
+// R'=R-K+1, C'=C-K+1, Pr=0, Pc=0.
+//
+// For convolution, mathematically, the output value at location (b, r', c', m)
+// is the inner product of two vectors: the chunk of input at
+// (b, (r'*S-Pr) : (r'*S-Pr+K), (c'*S-Pc) : (c'*S-Pc+K), :),
+// and the filter at (m, :, :, :).
+//
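+// A worked example (added for illustration; these numbers are not in the
+// original comment): for R = C = 5, K = 3, S = 2,
+//   SAME:  R' = C' = ceil(5 / 2) = 3, Pr = Pc = ((3 - 1) * 2 + 3 - 5) / 2 = 1
+//   VALID: R' = C' = ceil((5 - 3 + 1) / 2) = 2, Pr = Pc = 0
+//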
+Status Get2dOutputSize(const int in_height, const int in_width,
+ int filter_height, int filter_width, int row_stride,
+ int col_stride, Padding padding, int* new_height,
+ int* new_width, int* pad_rows, int* pad_cols);
+
+// Returns the same output dimensions as in Get2dOutputSize, but returns verbose
+// padding dimensions (top/bottom/left/right). Any excess padding (caused by
+// an odd padding size value) is added to the 'pad_bottom' and 'pad_right'
+// dimensions.
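+// For example (illustrative; this mirrors the Get2dOutputSizeVerbose case in
+// ops_util_test.cc): a 3x3 input with a 2x2 filter, stride 2 and SAME padding
+// needs 1 pad row and 1 pad column in total, so pad_top = pad_left = 0 and
+// pad_bottom = pad_right = 1.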
+Status Get2dOutputSizeVerbose(const int in_height, const int in_width,
+ int filter_height, int filter_width,
+ int row_stride, int col_stride, Padding padding,
+ int* new_height, int* new_width, int* pad_top,
+ int* pad_bottom, int* pad_left, int* pad_right);
+
+// Calculates broadcast starting index and size. For SAME padding, additional
+// padding could be applied to the right, left, top and bottom. Depending on the
+// current index, input size, kernel size, stride, padding size, the starting
+// index and size for broadcast for that dimension are different from the
+// current index and kernel size.
+// This is mainly used by gradient algorithms for pooling operations.
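+// For example (illustrative; the numbers mirror a case in ops_util_test.cc):
+// index = 1, in_size = 3, ksize = 3, stride = 1, pad_size = 1 yields
+// bindex = 0 and bsize = 3.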
+Status GetBroadcastSize(const int index, const int in_size,
+ const int ksize, const int stride,
+ const int pad_size, int* bindex, int* bsize);
+
+// Converts Brain's Padding to Eigen's PaddingType.
+Eigen::PaddingType BrainPadding2EigenPadding(Padding padding);
+
+// Given a shape 's' of a tensor of type T, returns true iff the
+// number of bytes occupied by each dim-0 slice (i.e., &tensor(i + 1, ...) -
+// &tensor(i, ...)) is a multiple of EIGEN_MAX_ALIGN_BYTES.
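+//
+// For example (illustrative, assuming EIGEN_MAX_ALIGN_BYTES == 32): a float
+// tensor of shape {4, 8} has 8 * sizeof(float) = 32 bytes per dim-0 slice and
+// is aligned, while shape {4, 3} (12 bytes per slice) is not.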
+template <typename T>
+bool IsInnerDimsSizeAligned(const TensorShape& s) {
+ if (s.dims() == 0) return false;
+ const int64 dim0_size = s.dim_size(0);
+ if (dim0_size == 0) return false;
+ const int64 bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T);
+ return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
+}
+
+// Returns in 'col_data', image patches in storage order (height, width, depth)
+// extracted from image at 'input_data', which is required to be in storage
+// order (batch, height, width, depth).
+// Implementation written by Yangqing Jia (jiayq).
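+// For example (illustrative sizes, not from the original comment): a 3x3
+// image with depth 1, a 2x2 filter, stride 1 and no padding gives
+// height_col = width_col = 2, so 'col_data' must hold 2 * 2 * 2 * 2 * 1 = 16
+// values.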
+template <typename T>
+void Im2col(const T* input_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* col_data) {
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ memcpy(col_data, input_data + (ih * width + iw) * depth,
+ sizeof(T) * depth);
+ } else {
+ // This should be simply padded with zero.
+ memset(col_data, 0, sizeof(T) * depth);
+ }
+ col_data += depth;
+ }
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+// Returns in 'im_data' the image in storage order (height, width, depth),
+// constructed by accumulating (summing) the patches in 'col_data', which is
+// required to be in storage order
+// (out_height * out_width, filter_height, filter_width, in_depth).
+// Implementation by Yangqing Jia (jiayq).
+template <typename T>
+void Col2im(const T* col_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* im_data) {
+ memset(im_data, 0, sizeof(T) * height * width * depth);
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ // TODO(andydavis) Vectorize this loop (if compiler does not).
+ for (int i = 0; i < depth; ++i) {
+ im_patch_data[i] += col_data[i];
+ }
+ }
+ im_patch_data += depth;
+ col_data += depth;
+ }
+        // Jump over the remaining (width - filter_w) columns of this image
+        // row, each holding 'depth' values.
+ im_patch_data += depth * (width - filter_w);
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
new file mode 100644
index 0000000000..bc4f57e220
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -0,0 +1,265 @@
+#include "tensorflow/core/kernels/ops_util.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class OpsUtilTest : public ::testing::Test {
+ protected:
+ OpsUtilTest() {}
+ ~OpsUtilTest() override {}
+
+ // Padding structure.
+ struct padding_struct {
+ // Input parameters.
+ struct {
+ int in_height;
+ int in_width;
+ int filter_height;
+ int filter_width;
+ int row_stride;
+ int col_stride;
+ Padding padding;
+ } input;
+ // Output.
+ struct {
+ int new_height;
+ int new_width;
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ } output;
+ };
+
+ // Broadcast structure.
+ struct bcast_struct {
+ // Input parameters.
+ struct {
+ int index; // Current index.
+ int in_size; // Size of the dimension.
+ int ksize; // Kernel size.
+ int stride; // Stride.
+ int pad_size; // Padding size.
+ } input;
+ // Output.
+ struct {
+ int new_index; // New starting index.
+ int new_size; // New broadcast size.
+ } output;
+ };
+
+ static void VerifyGet2dOutputSizeBoundaries(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_rows, pad_cols;
+ Status status = Get2dOutputSize(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_rows,
+ &pad_cols);
+ EXPECT_EQ(status.code(), code) << status;
+ }
+
+ static void VerifyGet2dOutputSizeValues(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_rows, pad_cols;
+ Status status = Get2dOutputSize(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_rows,
+ &pad_cols);
+ EXPECT_EQ(status.code(), code) << status;
+ EXPECT_EQ(pad_struct.output.new_height, new_height);
+ EXPECT_EQ(pad_struct.output.new_width, new_width);
+ EXPECT_EQ(pad_struct.output.pad_top, pad_rows);
+ EXPECT_EQ(pad_struct.output.pad_left, pad_cols);
+ }
+
+ static void VerifyGet2dOutputVerboseSizeValues(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_top, pad_bottom, pad_left, pad_right;
+ Status status = Get2dOutputSizeVerbose(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_top,
+ &pad_bottom, &pad_left, &pad_right);
+ EXPECT_EQ(status.code(), code) << status;
+ EXPECT_EQ(pad_struct.output.new_height, new_height);
+ EXPECT_EQ(pad_struct.output.new_width, new_width);
+ EXPECT_EQ(pad_struct.output.pad_top, pad_top);
+ EXPECT_EQ(pad_struct.output.pad_bottom, pad_bottom);
+ EXPECT_EQ(pad_struct.output.pad_left, pad_left);
+ EXPECT_EQ(pad_struct.output.pad_right, pad_right);
+ }
+
+ static void VerifyBoundaries(bcast_struct bcast, error::Code code) {
+ int new_index, new_size;
+ Status status = GetBroadcastSize(
+ bcast.input.index, bcast.input.in_size, bcast.input.ksize,
+ bcast.input.stride, bcast.input.pad_size, &new_index, &new_size);
+ EXPECT_EQ(status.code(), code) << status;
+ }
+
+ static void VerifyBcastValues(bcast_struct bcast) {
+ int new_index, new_size;
+ EXPECT_EQ(Status::OK(),
+ GetBroadcastSize(bcast.input.index, bcast.input.in_size,
+ bcast.input.ksize, bcast.input.stride,
+ bcast.input.pad_size, &new_index, &new_size));
+ EXPECT_EQ(bcast.output.new_index, new_index);
+ EXPECT_EQ(bcast.output.new_size, new_size);
+ }
+};
+
+// Test stride > ksize fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, Get2dOutputSizeInvalidTest) {
+ padding_struct pad_struct = {{3, 3, 1, 2, 2, 2, SAME}, {3, 3, 1, 1, 1, 1}};
+ VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeNegativeSizeTest) {
+ padding_struct pad_struct = {{1, 1, 3, 3, 1, 1, VALID}, {-1, -1, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeSquareFilterTest) {
+ padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeNonSquareFilterTest) {
+ padding_struct pad_struct1 = {{4, 5, 1, 2, 1, 1, SAME}, {4, 5, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{4, 5, 1, 2, 1, 1, VALID}, {4, 4, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeUnevenStrideTest) {
+ padding_struct pad_struct1 = {{4, 4, 2, 2, 1, 2, VALID}, {3, 2, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{4, 4, 2, 2, 2, 1, VALID}, {2, 3, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeVerbose) {
+ padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 1, 0, 1}};
+ padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}};
+ VerifyGet2dOutputVerboseSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputVerboseSizeValues(pad_struct2, error::OK);
+}
+
+// Test stride > ksize fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) {
+ bcast_struct bcast = {{0, 3, 1, 2, 0}, {0, 3}};
+ VerifyBoundaries(bcast, error::INVALID_ARGUMENT);
+}
+
+// Test index * stride > in_size fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, GetBroadcastTestBadIndex) {
+ bcast_struct bcast = {{2, 3, 1, 2, 0}, {0, 3}};
+ VerifyBoundaries(bcast, error::INVALID_ARGUMENT);
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 0}, {0, 3}},
+ {{1, 3, 3, 1, 0}, {1, 2}},
+ {{2, 3, 3, 1, 0}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 1}, {0, 2}},
+ {{1, 3, 3, 1, 1}, {0, 3}},
+ {{2, 3, 3, 1, 1}, {1, 2}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 2}, {0, 1}},
+ {{1, 3, 3, 1, 2}, {0, 2}},
+ {{2, 3, 3, 1, 2}, {0, 3}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 2}, {0, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 0}, {0, 3}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 2}, {0, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
new file mode 100644
index 0000000000..cb125ea2fe
--- /dev/null
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -0,0 +1,114 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
+template <typename Device, typename T>
+class PackOp : public OpKernel {
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit PackOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* c) override {
+ OpInputList values;
+ OP_REQUIRES_OK(c, c->input_list("values", &values));
+ const int num = values.size();
+
+ // Verify that all input shapes match
+ for (int i = 1; i < num; i++) {
+ OP_REQUIRES(c, values[0].shape().IsSameSize(values[i].shape()),
+ errors::InvalidArgument(
+ "Shapes of all inputs must match: values[0].shape = ",
+ values[0].shape().ShortDebugString(), " != values[", i,
+ "].shape = ", values[i].shape().ShortDebugString()));
+ }
+
+ TensorShape output_shape(values[0].shape());
+ output_shape.InsertDim(0, num);
+
+ // In the num = 1 case, just reshape the input
+ if (num == 1) {
+ Tensor output;
+ CHECK(output.CopyFrom(values[0], output_shape));
+ c->set_output(0, output);
+ return;
+ }
+
+ // Allocate output
+ Tensor* output;
+ OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+
+ const int output_size = output->NumElements();
+ if (output_size > 0) {
+ auto output_flat = output->shaped<T, 2>({1, output_size});
+
+ // Except for shapes, pack is a special case of concat, so we reuse the
+ // same computational kernels.
+ ConstMatrixVector inputs_flat;
+ inputs_flat.reserve(num);
+ for (int i = 0; i < num; ++i) {
+ inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ values[i].shaped<T, 2>({1, values[i].NumElements()})));
+ }
+ if (std::is_same<Device, GPUDevice>::value) {
+ ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
+ } else {
+ ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+ }
+ }
+ }
+};
+
+#define REGISTER_PACK(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Pack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ PackOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_PACK);
+REGISTER_PACK(quint8);
+REGISTER_PACK(qint8);
+REGISTER_PACK(qint32);
+REGISTER_PACK(bfloat16);
+
+#undef REGISTER_PACK
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Pack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ PackOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Pack")
+ .Device(DEVICE_GPU)
+ .HostMemory("values")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ PackOp<CPUDevice, int32>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
new file mode 100644
index 0000000000..6c66e54e3d
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -0,0 +1,159 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/pad_op.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class PadOp : public OpKernel {
+ public:
+ explicit PadOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in0 = context->input(0);
+ const Tensor& in1 = context->input(1);
+ const int dims = in0.dims();
+ static const int kMinDims = 0;
+ static const int kMaxDims = 5;
+ OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims,
+ errors::Unimplemented("inputs rank not in [", kMinDims, ",",
+ kMaxDims, "]: ", dims));
+ OP_REQUIRES(
+ context,
+ TensorShapeUtils::IsMatrix(in1.shape()) && in1.dim_size(1) == 2,
+ errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
+ in1.shape().DebugString()));
+ const int fixed_dims =
+ (kAllowLegacyScalars && dims == 0 && in1.dim_size(0) == 1) ? 1 : dims;
+ OP_REQUIRES(
+ context, fixed_dims == in1.dim_size(0),
+ errors::InvalidArgument(
+ "The first dimension of paddings must be the rank of inputs",
+ in1.shape().DebugString(), " ", in0.shape().DebugString()));
+
+ // Compute the shape of the output tensor, and allocate it.
+ TensorShape output_shape;
+ TTypes<int32>::ConstMatrix paddings = in1.matrix<int32>();
+ for (int d = 0; d < fixed_dims; ++d) {
+ const int32 before_d = paddings(d, 0); // Pad before existing elements.
+ const int32 after_d = paddings(d, 1);  // Pad after existing elements.
+ OP_REQUIRES(context, before_d >= 0 && after_d >= 0,
+ errors::InvalidArgument("Paddings must be non-negative: ",
+ before_d, " ", after_d));
+ const int size_d =
+ (kAllowLegacyScalars && d == in0.dims()) ? 1 : in0.dim_size(d);
+ output_shape.AddDim(before_d + size_d + after_d);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ // Invoke the dims-specific implementation.
+ switch (fixed_dims) {
+ case 0:
+ Operate<0>(context, in0.tensor<T, 0>(), paddings, output);
+ break;
+ case 1:
+ // TODO(irving): Once Pad doesn't need a scalar special case,
+ // change flat to tensor. That is, once !kAllowLegacyScalars.
+ Operate<1>(context, in0.flat<T>(), paddings, output);
+ break;
+ case 2:
+ Operate<2>(context, in0.tensor<T, 2>(), paddings, output);
+ break;
+ case 3:
+ Operate<3>(context, in0.tensor<T, 3>(), paddings, output);
+ break;
+ case 4:
+ Operate<4>(context, in0.tensor<T, 4>(), paddings, output);
+ break;
+ case 5:
+ Operate<5>(context, in0.tensor<T, 5>(), paddings, output);
+ break;
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("Only ranks up to 5 supported: ",
+ in0.shape().DebugString()));
+ }
+ }
+
+ private:
+ template <int Dims>
+ void Operate(OpKernelContext* context,
+ typename TTypes<T, Dims>::ConstTensor input,
+ TTypes<int32>::ConstMatrix paddings, Tensor* output) {
+ CHECK_EQ(Dims, paddings.dimension(0));
+ CHECK_EQ(2, paddings.dimension(1));
+ Eigen::array<std::pair<int32, int32>, Dims> paddings_array;
+ for (int i = 0; i < Dims; ++i) {
+ paddings_array[i] = std::make_pair(paddings(i, 0), paddings(i, 1));
+ }
+ functor::Pad<Device, T, Dims> functor;
+ functor(context->eigen_device<Device>(), output->tensor<T, Dims>(), input,
+ paddings_array);
+ }
+};
+
+#define REGISTER_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Pad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("paddings"), \
+ PadOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void Pad<GPUDevice, T, Dims>::operator()( \
+ const GPUDevice& d, typename TTypes<T, Dims>::Tensor output, \
+ typename TTypes<T, Dims>::ConstTensor input, \
+ Eigen::array<std::pair<int32, int32>, Dims> paddings); \
+ extern template struct Pad<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 0); \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Pad") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("paddings"), \
+ PadOp<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h
new file mode 100644
index 0000000000..c4f8a4abda
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op.h
@@ -0,0 +1,27 @@
+#ifndef TENSORFLOW_KERNELS_PAD_OP_H_
+#define TENSORFLOW_KERNELS_PAD_OP_H_
+// Functor definition for PadOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by PadOp to do the computations.
+template <typename Device, typename T, int Dims>
+struct Pad {
+ // Pad "input" into "output", as specified by "paddings". See pad_op.cc for
+ // details.
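+ // Each paddings[d] = {before, after} adds that many elements before and
+ // after dimension d; Eigen's pad() fills the new elements with zero by
+ // default.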
+ void operator()(const Device& d, typename TTypes<T, Dims>::Tensor output,
+ typename TTypes<T, Dims>::ConstTensor input,
+ Eigen::array<std::pair<int32, int32>, Dims> paddings) {
+ output.device(d) = input.pad(paddings);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_PAD_OP_H_
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
new file mode 100644
index 0000000000..35a03a2cb2
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -0,0 +1,26 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/pad_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in pad_op.cc.
+#define DEFINE_GPU_SPECS(T) \
+ template struct functor::Pad<GPUDevice, T, 0>; \
+ template struct functor::Pad<GPUDevice, T, 1>; \
+ template struct functor::Pad<GPUDevice, T, 2>; \
+ template struct functor::Pad<GPUDevice, T, 3>; \
+ template struct functor::Pad<GPUDevice, T, 4>; \
+ template struct functor::Pad<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
new file mode 100644
index 0000000000..35e9bd75fa
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -0,0 +1,252 @@
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+PoolParameters::PoolParameters(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride,
+ Padding padding,
+ const TensorShape& tensor_in_shape) {
+ // For pooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ depth = tensor_in_shape.dim_size(3);
+ tensor_in_cols = tensor_in_shape.dim_size(2);
+ tensor_in_rows = tensor_in_shape.dim_size(1);
+ tensor_in_batch = tensor_in_shape.dim_size(0);
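+ // ksize and stride are given in NHWC order: {batch, rows, cols, depth}.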
+ window_rows = ksize[1];
+ window_cols = ksize[2];
+ depth_window = ksize[3];
+ row_stride = stride[1];
+ col_stride = stride[2];
+ depth_stride = stride[3];
+
+ // We only support 2D pooling across width/height and depthwise
+ // pooling, not a combination.
+ OP_REQUIRES(context,
+ (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+ errors::Unimplemented(
+ "MaxPooling supports exactly one of pooling across depth "
+ "or pooling across width/height."));
+
+ if (depth_window == 1) {
+ OP_REQUIRES_OK(context, Get2dOutputSize(
+ tensor_in_rows, tensor_in_cols, window_rows,
+ window_cols, row_stride, col_stride, padding,
+ &out_height, &out_width, &pad_rows, &pad_cols));
+ } else {
+ // Our current version of depthwise max pooling does not support
+ // any padding, and expects the depth_window to equal the
+ // depth_stride (no overlapping).
+ OP_REQUIRES(
+ context, depth % depth_window == 0,
+ errors::Unimplemented("Depthwise max pooling requires the depth "
+ "window to evenly divide the input depth"));
+ OP_REQUIRES(
+ context, depth_stride == depth_window,
+ errors::Unimplemented("Depthwise max pooling requires the depth "
+ "window to equal the depth stride"));
+
+ // The current version of depthwise max is only implemented on CPU.
+ OP_REQUIRES(context,
+ (DeviceType(static_cast<Device*>(context->device())
+ ->attributes()
+ .device_type()) == DeviceType(DEVICE_CPU)),
+ errors::Unimplemented("Depthwise max pooling is currently "
+ "only implemented for CPU devices."));
+
+ pad_depth = 0;
+ out_depth = depth / depth_window;
+ }
+}
+
+TensorShape PoolParameters::forward_output_shape() {
+ if (depth_window == 1) {
+ // Spatial pooling
+ return TensorShape({tensor_in_batch, out_height, out_width, depth});
+ } else {
+ // Depthwise pooling
+ return TensorShape(
+ {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
+ }
+}
+
+#ifdef GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void TransformDepth<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformDepth<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+template <typename T>
+void DnnPoolingGradOp<T>::Compute(
+ OpKernelContext* context,
+ perftools::gputools::dnn::PoolingMode pooling_mode,
+ const std::vector<int32>& size, const std::vector<int32>& stride,
+ Padding padding, const Tensor* tensor_in, const Tensor* tensor_out,
+ const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+ CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+ (tensor_in && tensor_out))
+ << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
+ "specified";
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, tensor_in_shape, &output));
+
+ PoolParameters params{context, size, stride, padding, tensor_in_shape};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ /// For now, cudnn does not support the NHWC format, so we need to convert
+ /// the tensors to NCHW before calling cudnn. We can drop this conversion
+ /// once cudnn supports NHWC directly.
+ Tensor transformed_input;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({tensor_in_shape.dim_size(0),
+ tensor_in_shape.dim_size(3),
+ tensor_in_shape.dim_size(1),
+ tensor_in_shape.dim_size(2)}),
+ &transformed_input));
+ Tensor transformed_input_backprop;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({tensor_in_shape.dim_size(0),
+ tensor_in_shape.dim_size(3),
+ tensor_in_shape.dim_size(1),
+ tensor_in_shape.dim_size(2)}),
+ &transformed_input_backprop));
+ Tensor transformed_output;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+ out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+ &transformed_output));
+ Tensor transformed_output_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+ out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+ &transformed_output_backprop));
+
+ auto nhwc_to_nchw = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2);
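+ // The (0, 3, 1, 2) shuffle moves the channel dimension from last (NHWC)
+ // to second (NCHW), matching the kBatchDepthYX layout used below.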
+ if (tensor_in) {
+ // For AvgPoolGrad, the original input tensor is not necessary. However,
+ // cudnn still requires it to be passed in, although it does not affect
+ // the results.
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), tensor_in->tensor<T, 4>(),
+ nhwc_to_nchw, transformed_input.tensor<T, 4>());
+ }
+ if (tensor_out) {
+ // For AvgPoolGrad, the original output tensor is not necessary. However,
+ // cudnn still requires it to be passed in, although it does not affect
+ // the results.
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), tensor_out->tensor<T, 4>(),
+ nhwc_to_nchw, transformed_output.tensor<T, 4>());
+ }
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>());
+
+ /// Get ready to call cudnn
+ perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+ pooling_desc.set_pooling_mode(pooling_mode)
+ .set_window_height(params.window_rows)
+ .set_window_width(params.window_cols)
+ .set_vertical_stride(params.row_stride)
+ .set_horizontal_stride(params.col_stride)
+ .set_vertical_padding(params.pad_rows)
+ .set_horizontal_padding(params.pad_cols);
+
+ perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+ orig_output_desc.set_count(params.tensor_in_batch)
+ .set_height(params.out_height)
+ .set_width(params.out_width)
+ .set_feature_map_count(params.depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+ perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+ orig_input_desc.set_count(params.tensor_in_batch)
+ .set_height(params.tensor_in_rows)
+ .set_width(params.tensor_in_cols)
+ .set_feature_map_count(params.depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+ auto orig_output_data =
+ AsDeviceMemory(transformed_output.template flat<T>().data(),
+ transformed_output.template flat<T>().size());
+ auto orig_input_data =
+ AsDeviceMemory(transformed_input.template flat<T>().data(),
+ transformed_input.template flat<T>().size());
+ auto output_backprop =
+ AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
+ transformed_output_backprop.template flat<T>().size());
+ auto input_backprop =
+ AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
+ transformed_input_backprop.template flat<T>().size());
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ bool status =
+ stream->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
+ orig_output_desc, orig_output_data,
+ output_backprop, &input_backprop)
+ .ok();
+ OP_REQUIRES(context, status,
+ errors::Internal("cudnn PoolBackward launch failed"));
+
+ /// Transform the output data from NCHW back to NHWC
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1);
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
+ nchw_to_nhwc, output->tensor<T, 4>());
+}
+
+template class DnnPoolingGradOp<float>;
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
new file mode 100644
index 0000000000..5bf44b6e40
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -0,0 +1,264 @@
+#ifndef TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/avgpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// A helper class to manage sizes and shapes for pooling operations.
+struct PoolParameters {
+ // Updates context->status if there is an invalid input.
+ PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ const TensorShape& tensor_in_shape);
+
+ // Returns the shape of the output for "forward" pooling operations.
+ TensorShape forward_output_shape();
+
+ int depth;
+
+ int tensor_in_cols;
+ int tensor_in_rows;
+ int tensor_in_batch;
+
+ int window_rows;
+ int window_cols;
+ int depth_window;
+
+ int row_stride;
+ int col_stride;
+ int depth_stride;
+
+ int out_height;
+ int out_width;
+ int out_depth;
+
+ int pad_rows;
+ int pad_cols;
+ int pad_depth;
+};
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MaxPoolingOp : public UnaryOp<T> {
+ public:
+ explicit MaxPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, params.forward_output_shape(), &output));
+
+ if (params.depth_window > 1) {
+ DepthwiseMaxPool(context, output, tensor_in, params);
+ } else {
+ SpatialMaxPool(context, output, tensor_in, params, padding_);
+ }
+ }
+
+ private:
+ // Single-threaded implementation of DepthwiseMaxPool, which
+ // does not handle all of the same options as SpatialMaxPool
+ // (it relies on the no-padding and depth_stride == depth_window
+ // restrictions enforced by PoolParameters).
+ //
+ // TODO(vrv): implement a more general depthwise-max pool that works
+ // on GPU as well.
+ void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
+ const Tensor& tensor_in, const PoolParameters& params) {
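+ // View the input as a (depth_window x N/depth_window) matrix so that each
+ // column holds one pooling group along depth; the column-wise max then
+ // yields one output value per group.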
+ Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
+ tensor_in.NumElements() / params.depth_window);
+ Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
+ output->flat<T>().data(), 1, output->NumElements());
+ out_by_pool = in_by_pool.colwise().maxCoeff();
+ }
+
+ void SpatialMaxPool(OpKernelContext* context, Tensor* output,
+ const Tensor& tensor_in, const PoolParameters& params,
+ const Padding& padding) {
+ // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
+ // EigenMatrix version that is currently faster than Eigen's
+ // Spatial MaxPooling implementation.
+ //
+ // TODO(vrv): Remove this once we no longer need it.
+ if (std::is_same<Device, GPUDevice>::value) {
+ Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
+ functor::SpatialMaxPooling<Device, T>()(
+ context->eigen_device<Device>(), output->tensor<T, 4>(),
+ tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, pt);
+ } else {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
+ ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows *
+ params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ output->flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ // Initializes the output tensor with MIN<T>.
+ output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // max value.
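+ //
+ // For each input column (b, h, w), the ranges [h_start, h_end) and
+ // [w_start, w_end) computed below index the output positions whose
+ // pooling window covers that input position.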
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
+ // compute elementwise max
+ const int in_offset =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_offset =
+ (b * params.out_height + ph) * params.out_width + pw;
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+void SpatialAvgPool(OpKernelContext* context, Tensor* output,
+ const Tensor& input, const PoolParameters& params,
+ const Padding& padding) {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
+ auto in_flat = input.flat<T>();
+ auto out_flat = output->flat<T>();
+
+ ConstEigenMatrixMap in_mat(
+ in_flat.data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ out_flat.data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
+ out_count.setZero();
+
+ // Initializes output to zero.
+ out_flat.setZero();
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // average value.
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
+ const int in_offset =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ Eigen::DSizes<ptrdiff_t, 2> in_indices(0, in_offset);
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_offset =
+ (b * params.out_height + ph) * params.out_width + pw;
+ out_mat.col(out_offset) += in_mat.col(in_offset);
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
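+ // Each output column now holds the sum over the input columns in its
+ // pooling window; dividing by out_count converts the sums into averages.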
+ DCHECK_GT(out_count.minCoeff(), 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+}
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
new file mode 100644
index 0000000000..87a3ef5186
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -0,0 +1,39 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
+
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/avgpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// A helper class that launches the cudnn pooling backward operations.
+// The original input and output tensors are optional for AvgPoolGrad, but
+// mandatory for MaxPoolGrad.
+template <typename T>
+class DnnPoolingGradOp {
+ public:
+ typedef GPUDevice Device;
+ static void Compute(OpKernelContext* context,
+ perftools::gputools::dnn::PoolingMode pooling_mode,
+ const std::vector<int32>& size,
+ const std::vector<int32>& stride, Padding padding,
+ const Tensor* tensor_in, const Tensor* tensor_out,
+ const Tensor& out_backprop,
+ const TensorShape& tensor_in_shape);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
new file mode 100644
index 0000000000..1b13f68a3a
--- /dev/null
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -0,0 +1,153 @@
+#include "tensorflow/core/kernels/queue_base.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+namespace {
+
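+// Copies the index^th slice (along dimension 0) of 'parent' into 'element'.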
+template <DataType DT>
+void HandleSliceToElement(const Tensor& parent, Tensor* element, int index) {
+ typedef typename EnumToDataType<DT>::Type T;
+ auto parent_as_matrix = parent.flat_outer_dims<T>();
+ element->flat<T>() = parent_as_matrix.chip(index, 0);
+}
+
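+// Copies 'element' into the index^th slice (along dimension 0) of 'parent'.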
+template <DataType DT>
+void HandleElementToSlice(const Tensor& element, Tensor* parent, int index) {
+ typedef typename EnumToDataType<DT>::Type T;
+ auto parent_as_matrix = parent->flat_outer_dims<T>();
+ parent_as_matrix.chip(index, 0) = element.flat<T>();
+}
+
+} // namespace
+
+// static
+Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
+ int index) {
+#define HANDLE_TYPE(DT) \
+ if (parent.dtype() == DT) { \
+ HandleSliceToElement<DT>(parent, element, index); \
+ return Status::OK(); \
+ }
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT8);
+ HANDLE_TYPE(DT_STRING);
+ HANDLE_TYPE(DT_INT64);
+#undef HANDLE_TYPE
+ return errors::Unimplemented("Unhandled data type: ", parent.dtype());
+}
+
+// static
+Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent,
+ int index) {
+#define HANDLE_TYPE(DT) \
+ if (element.dtype() == DT) { \
+ HandleElementToSlice<DT>(element, parent, index); \
+ return Status::OK(); \
+ }
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT8);
+ HANDLE_TYPE(DT_STRING);
+ HANDLE_TYPE(DT_INT64);
+#undef HANDLE_TYPE
+ return errors::Unimplemented("Unhandled data type: ", element.dtype());
+}
+
+QueueBase::QueueBase(const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name)
+ : component_dtypes_(component_dtypes),
+ component_shapes_(component_shapes),
+ name_(name) {}
+
+Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const {
+ if (tuple.size() != static_cast<size_t>(num_components())) {
+ return errors::InvalidArgument(
+ "Wrong number of components in tuple. Expected ", num_components(),
+ ", got ", tuple.size());
+ }
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (tuple[i].dtype() != component_dtypes_[i]) {
+ return errors::InvalidArgument(
+ "Type mismatch in tuple component ", i, ". Expected ",
+ DataTypeString(component_dtypes_[i]), ", got ",
+ DataTypeString(tuple[i].dtype()));
+ }
+ }
+ return Status::OK();
+}
+
+// static
+string QueueBase::ShapeListString(const gtl::ArraySlice<TensorShape>& shapes) {
+ string result = "[";
+ bool first = true;
+ for (const TensorShape& shape : shapes) {
+ strings::StrAppend(&result, (first ? "" : ", "), shape.ShortDebugString());
+ first = false;
+ }
+ strings::StrAppend(&result, "]");
+ return result;
+}
+
+Status QueueBase::MatchesNodeDefOp(const NodeDef& node_def,
+ const string& op) const {
+ if (node_def.op() != op) {
+ return errors::InvalidArgument("Shared queue '", name_, "' has type '", op,
+ "' that does not match type of Node '",
+ node_def.name(), "': ", node_def.op());
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefCapacity(const NodeDef& node_def,
+ int32 capacity) const {
+ int32 requested_capacity = -1;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "capacity", &requested_capacity));
+ if (requested_capacity < 0) requested_capacity = kUnbounded;
+ if (requested_capacity != capacity) {
+ return errors::InvalidArgument("Shared queue '", name_, "' has capacity ",
+ capacity, " but requested capacity was ",
+ requested_capacity);
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefTypes(const NodeDef& node_def) const {
+ DataTypeVector requested_dtypes;
+ TF_RETURN_IF_ERROR(
+ GetNodeAttr(node_def, "component_types", &requested_dtypes));
+ if (requested_dtypes != component_dtypes_) {
+ return errors::InvalidArgument("Shared queue '", name_,
+ "' has component types ",
+ DataTypeSliceString(component_dtypes_),
+ " but requested component types were ",
+ DataTypeSliceString(requested_dtypes));
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefShapes(const NodeDef& node_def) const {
+ std::vector<TensorShape> requested_shapes;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "shapes", &requested_shapes));
+ if (requested_shapes != component_shapes_) {
+ return errors::InvalidArgument("Shared queue '", name_,
+ "' has component shapes ",
+ ShapeListString(component_shapes_),
+ " but requested component shapes were ",
+ ShapeListString(requested_shapes));
+ }
+ return Status::OK();
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
new file mode 100644
index 0000000000..4897102974
--- /dev/null
+++ b/tensorflow/core/kernels/queue_base.h
@@ -0,0 +1,77 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Functionality common to QueueInterface implementations.
+class QueueBase : public QueueInterface {
+ public:
+ // Used as a possible value of 'capacity', meaning the queue is unbounded.
+ static const int32 kUnbounded = INT_MAX;
+
+ // Args:
+ // component_dtypes: The types of each component in a queue-element tuple.
+ // component_shapes: The shapes of each component in a queue-element tuple,
+ // which must either be empty (if the shapes are not specified) or
+ // have the same size as component_dtypes.
+ // name: A name to use for the queue.
+ QueueBase(const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+
+ // Implementations of QueueInterface methods --------------------------------
+ const DataTypeVector& component_dtypes() const override {
+ return component_dtypes_;
+ }
+
+ // Other public methods -----------------------------------------------------
+ const std::vector<TensorShape>& component_shapes() const {
+ return component_shapes_;
+ }
+
+ protected:
+ // Returns the number of components in a queue-element tuple.
+ int32 num_components() const { return component_dtypes_.size(); }
+
+ // True if shapes were specified. If so, inputs will be validated
+ // against them, etc.
+ bool specified_shapes() const { return component_shapes_.size() > 0; }
+
+ // Code common to Validate*Tuple().
+ Status ValidateTupleCommon(const Tuple& tuple) const;
+
+ // Copies the index^th slice (in the first dimension) of parent into element.
+ static Status CopySliceToElement(const Tensor& parent, Tensor* element,
+ int index);
+
+ // Copies element into the index^th slice (in the first dimension) of parent.
+ static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
+ int index);
+
+ ~QueueBase() override {}
+
+ // Helpers for implementing MatchesNodeDef().
+ static string ShapeListString(const gtl::ArraySlice<TensorShape>& shapes);
+ Status MatchesNodeDefOp(const NodeDef& node_def, const string& op) const;
+ Status MatchesNodeDefCapacity(const NodeDef& node_def, int32 capacity) const;
+ Status MatchesNodeDefTypes(const NodeDef& node_def) const;
+ Status MatchesNodeDefShapes(const NodeDef& node_def) const;
+
+ const DataTypeVector component_dtypes_;
+ const std::vector<TensorShape> component_shapes_;
+ const string name_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueBase);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
new file mode 100644
index 0000000000..c70dc76777
--- /dev/null
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -0,0 +1,288 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class QueueOpKernel : public AsyncOpKernel {
+ public:
+ explicit QueueOpKernel(OpKernelConstruction* context)
+ : AsyncOpKernel(context) {}
+
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback callback) final {
+ QueueInterface* queue;
+ OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &queue),
+ callback);
+ ComputeAsync(ctx, queue, [callback, queue]() {
+ queue->Unref();
+ callback();
+ });
+ }
+
+ protected:
+ virtual void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) = 0;
+};
+
+class QueueAccessOpKernel : public QueueOpKernel {
+ public:
+ explicit QueueAccessOpKernel(OpKernelConstruction* context)
+ : QueueOpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("timeout_ms", &timeout_));
+ // TODO(keveman): Enable timeout.
+ OP_REQUIRES(context, timeout_ == -1,
+ errors::InvalidArgument("Timeout not supported yet."));
+ }
+
+ protected:
+ int64 timeout_;
+};
+
+// Defines an EnqueueOp, the execution of which enqueues a tuple of
+// tensors in the given Queue.
+//
+// The op has 1 + k inputs, where k is the number of components in the
+// tuples stored in the given Queue:
+// - Input 0: queue handle.
+// - Input 1: 0th element of the tuple.
+// - ...
+// - Input k: (k-1)th element of the tuple.
+class EnqueueOp : public QueueAccessOpKernel {
+ public:
+ explicit EnqueueOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ DataTypeVector expected_inputs = {DT_STRING_REF};
+ for (DataType dt : queue->component_dtypes()) {
+ expected_inputs.push_back(dt);
+ }
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature(expected_inputs, {}),
+ callback);
+
+ QueueInterface::Tuple tuple;
+ OpInputList components;
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components),
+ callback);
+ for (const Tensor& Tcomponent : components) {
+ tuple.push_back(Tcomponent);
+ }
+
+ OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateTuple(tuple), callback);
+ queue->TryEnqueue(tuple, ctx, callback);
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(EnqueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueEnqueue").Device(DEVICE_CPU), EnqueueOp);
+
+// Defines an EnqueueManyOp, the execution of which slices each
+// component of a tuple of tensors along the 0th dimension, and
+// enqueues tuples of slices in the given Queue.
+//
+// The op has 1 + k inputs, where k is the number of components in the
+// tuples stored in the given Queue:
+// - Input 0: queue handle.
+// - Input 1: 0th element of the tuple.
+// - ...
+// - Input k: (k-1)th element of the tuple.
+//
+// N.B. All tuple components must have the same size in the 0th
+// dimension.
+class EnqueueManyOp : public QueueAccessOpKernel {
+ public:
+ explicit EnqueueManyOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ DataTypeVector expected_inputs = {DT_STRING_REF};
+ for (DataType dt : queue->component_dtypes()) {
+ expected_inputs.push_back(dt);
+ }
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature(expected_inputs, {}),
+ callback);
+
+ QueueInterface::Tuple tuple;
+ OpInputList components;
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components),
+ callback);
+ for (const Tensor& Tcomponent : components) {
+ tuple.push_back(Tcomponent);
+ }
+
+ OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateManyTuple(tuple), callback);
+ queue->TryEnqueueMany(tuple, ctx, callback);
+ }
+
+ ~EnqueueManyOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(EnqueueManyOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueEnqueueMany").Device(DEVICE_CPU),
+ EnqueueManyOp);
+
+// Defines a DequeueOp, the execution of which dequeues a tuple of
+// tensors from the given Queue.
+//
+// The op has one input, which is the handle of the appropriate
+// Queue. The op has k outputs, where k is the number of components in
+// the tuples stored in the given Queue, and output i is the ith
+// component of the dequeued tuple.
+class DequeueOp : public QueueAccessOpKernel {
+ public:
+ explicit DequeueOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->MatchSignature({DT_STRING_REF}, queue->component_dtypes()),
+ callback);
+
+ queue->TryDequeue(ctx, [ctx, callback](const QueueInterface::Tuple& tuple) {
+ if (!ctx->status().ok()) {
+ callback();
+ return;
+ }
+ OpOutputList output_components;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->output_list("components", &output_components), callback);
+ for (int i = 0; i < ctx->num_outputs(); ++i) {
+ output_components.set(i, tuple[i]);
+ }
+ callback();
+ });
+ }
+
+ ~DequeueOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(DequeueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueDequeue").Device(DEVICE_CPU), DequeueOp);
+
+// Defines a DequeueManyOp, the execution of which concatenates the
+// requested number of elements from the given Queue along the 0th
+// dimension, and emits the result as a single tuple of tensors.
+//
+// The op has two inputs:
+// - Input 0: the handle to a queue.
+// - Input 1: the number of elements to dequeue.
+//
+// The op has k outputs, where k is the number of components in the
+// tuples stored in the given Queue, and output i is the ith component
+// of the dequeued tuple.
+class DequeueManyOp : public QueueAccessOpKernel {
+ public:
+ explicit DequeueManyOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ const Tensor& Tnum_elements = ctx->input(1);
+ int32 num_elements = Tnum_elements.flat<int32>()(0);
+
+ OP_REQUIRES_ASYNC(
+ ctx, num_elements >= 0,
+ errors::InvalidArgument("DequeueManyOp must request a positive number "
+ "of elements"),
+ callback);
+
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+ queue->component_dtypes()),
+ callback);
+
+ queue->TryDequeueMany(
+ num_elements, ctx, [ctx, callback](const QueueInterface::Tuple& tuple) {
+ if (!ctx->status().ok()) {
+ callback();
+ return;
+ }
+ OpOutputList output_components;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->output_list("components", &output_components),
+ callback);
+ for (int i = 0; i < ctx->num_outputs(); ++i) {
+ output_components.set(i, tuple[i]);
+ }
+ callback();
+ });
+ }
+
+ ~DequeueManyOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(DequeueManyOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueDequeueMany").Device(DEVICE_CPU),
+ DequeueManyOp);
+
+// Defines a QueueCloseOp, which closes the given Queue. Closing a
+// Queue signals that no more elements will be enqueued in it.
+//
+// The op has one input, which is the handle of the appropriate Queue.
+class QueueCloseOp : public QueueOpKernel {
+ public:
+ explicit QueueCloseOp(OpKernelConstruction* context)
+ : QueueOpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("cancel_pending_enqueues",
+ &cancel_pending_enqueues_));
+ }
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ queue->Close(ctx, cancel_pending_enqueues_, callback);
+ }
+
+ private:
+ bool cancel_pending_enqueues_;
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueCloseOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueClose").Device(DEVICE_CPU), QueueCloseOp);
+
+// Defines a QueueSizeOp, which computes the number of elements in the
+// given Queue, and emits it as an output tensor.
+//
+// The op has one input, which is the handle of the appropriate Queue;
+// and one output, which is a single-element tensor containing the current
+// size of that Queue.
+class QueueSizeOp : public QueueOpKernel {
+ public:
+ explicit QueueSizeOp(OpKernelConstruction* context)
+ : QueueOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ Tensor* Tqueue_size = nullptr;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_size), callback);
+ Tqueue_size->flat<int32>().setConstant(queue->size());
+ callback();
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueSizeOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueSize").Device(DEVICE_CPU), QueueSizeOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
new file mode 100644
index 0000000000..4fc12e92cb
--- /dev/null
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -0,0 +1,103 @@
+// See docs in ../ops/image_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+template <typename T>
+class RandomCropOp : public OpKernel {
+ public:
+ explicit RandomCropOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 3,
+ errors::InvalidArgument("input must be 3-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto shape_vec = shape_t.vec<int64>();
+ const int32 target_height = shape_vec(0);
+ const int32 target_width = shape_vec(1);
+
+ const int32 height = input.dim_size(0);
+ const int32 width = input.dim_size(1);
+ const int32 channels = input.dim_size(2);
+
+ // Allocate the output tensor with shape
+ // [target_height, target_width, channels].
+ Tensor* output = nullptr;
+ const auto output_shape =
+ TensorShape({target_height, target_width, channels});
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ // If the target size matches the actual size, then do nothing.
+ if ((target_height == height) && (target_width == width)) {
+ *output = context->input(0);
+ }
+
+ // TODO(shlens): Implement the edge case where the target dimensions are
+ // larger than the image by zero-padding the image, so that the output is
+ // *always* [target_height, target_width] in size. For now such inputs are
+ // rejected below.
+ OP_REQUIRES(context, width >= target_width, errors::FailedPrecondition(
+ "width must be >= target_width: width = ", width,
+ ", target_width = ", target_width));
+ OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition(
+ "height must be >= target_height: height = ", height,
+ ", target_height = ", target_height));
+
+ int32 offset_height = 0;
+ int32 offset_width = 0;
+
+ auto local_gen = generator_.ReserveSamples32(2);
+ random::SimplePhilox random(&local_gen);
+
+ if (width > target_width) {
+ offset_width = random.Rand32() % (width - target_width + 1);
+ }
+ if (height > target_height) {
+ offset_height = random.Rand32() % (height - target_height + 1);
+ }
+
+ // TODO(shlens): Do this more efficiently with memcpy once padding is
+ // available for smaller images.
+ typename TTypes<T, 3>::ConstTensor input_data = input.tensor<T, 3>();
+ typename TTypes<T, 3>::Tensor output_data = output->tensor<T, 3>();
+
+ for (int y = 0; y < target_height; ++y) {
+ for (int x = 0; x < target_width; ++x) {
+ for (int c = 0; c < channels; ++c) {
+ output_data(y, x, c) =
+ input_data(y + offset_height, x + offset_width, c);
+ }
+ }
+ }
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomCrop").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ RandomCropOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_crop_op_test.cc b/tensorflow/core/kernels/random_crop_op_test.cc
new file mode 100644
index 0000000000..1f232f4969
--- /dev/null
+++ b/tensorflow/core/kernels/random_crop_op_test.cc
@@ -0,0 +1,60 @@
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class RandomCropOpTest : public OpsTestBase {
+ protected:
+ RandomCropOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("random_crop_op", "RandomCrop")
+ .Input(FakeInput(DT_UINT8))
+ .Input(FakeInput(DT_INT64))
+ .Attr("T", DT_UINT8)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(RandomCropOpTest, Basic) {
+ AddInputFromArray<uint8>(TensorShape({1, 2, 1}), {2, 2});
+ AddInputFromArray<int64>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({1, 1, 1}));
+ test::FillValues<uint8>(&expected, {2});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+TEST_F(RandomCropOpTest, SameSizeOneChannel) {
+ AddInputFromArray<uint8>(TensorShape({2, 1, 1}), {1, 2});
+ AddInputFromArray<int64>(TensorShape({2}), {2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 1}));
+ test::FillValues<uint8>(&expected, {1, 2});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+TEST_F(RandomCropOpTest, SameSizeMultiChannel) {
+ AddInputFromArray<uint8>(TensorShape({2, 1, 3}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int64>(TensorShape({2}), {2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 3}));
+ test::FillValues<uint8>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
new file mode 100644
index 0000000000..09b66d30e6
--- /dev/null
+++ b/tensorflow/core/kernels/random_op.cc
@@ -0,0 +1,276 @@
+// See docs in ../ops/random_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/random_op.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// The default implementation of the functor, which should never be invoked.
+// But we still need to provide an implementation for now for the linker to
+// work, since we do not support all the distributions yet.
+template <typename Device, class Distribution>
+struct FillPhiloxRandom {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext*, const Device&, random::PhiloxRandom gen,
+ T* data, int64 size) {
+ LOG(FATAL) << "Default FillPhiloxRandom should not be executed.";
+ }
+};
+
+#if GOOGLE_CUDA
+// Declaration for the partial specialization with GPU
+template <class Distribution>
+struct FillPhiloxRandom<GPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext* ctx, const GPUDevice&,
+ random::PhiloxRandom gen, T* data, int64 size);
+};
+
+#endif
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomTask;
+
+// Specialization for distributions that take a fixed number of samples for
+// each output.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, false> {
+ typedef typename Distribution::ResultElementType T;
+ static void Run(random::PhiloxRandom gen, T* data, int64 size,
+ int64 start_group, int64 limit_group) {
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ gen.Skip(start_group);
+ int64 offset = start_group * kGroupSize;
+
+ // First fill all the full-size groups
+ int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+ for (int64 index = start_group; index < limit_group_full; ++index) {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ // If there are any remaining elements that need to be filled, process them
+ if (limit_group_full < limit_group) {
+ int remaining_size = size - limit_group_full * kGroupSize;
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
+
+// Specialization for distributions that take a variable number of samples for
+// each output. This will be slower due to the generality.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, true> {
+ typedef typename Distribution::ResultElementType T;
+ static const int64 kReservedSamplesPerOutput = 256;
+
+ static void Run(random::PhiloxRandom base_gen, T* data, int64 size,
+ int64 start_group, int64 limit_group) {
+ using random::PhiloxRandom;
+ using random::SingleSampleAdapter;
+
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ static const int kGeneratorSkipPerOutputGroup =
+ kGroupSize * kReservedSamplesPerOutput /
+ PhiloxRandom::kResultElementCount;
+
+ int64 offset = start_group * kGroupSize;
+
+ // First fill all the full-size groups
+ int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+ int64 group_index;
+ for (group_index = start_group; group_index < limit_group_full;
+ ++group_index) {
+ // Reset the generator to the beginning of the output group region
+ // This is necessary if we want the results to be independent of order
+ // of work
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ // If there are any remaining elements that need to be filled, process them
+ if (limit_group_full < limit_group) {
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ int remaining_size = size - limit_group_full * kGroupSize;
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
+
+// Partial specialization for CPU to fill the entire region with random values.
+// It splits the work into several tasks and runs them in parallel.
+template <class Distribution>
+struct FillPhiloxRandom<CPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext* context, const CPUDevice&,
+ random::PhiloxRandom gen, T* data, int64 size) {
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+
+ int64 total_group_count = (size + kGroupSize - 1) / kGroupSize;
+
+ // Limit to a maximum of six threads for now. The performance scaling is
+ // very sub-linear, and too many threads cause much worse overall
+ // performance.
+ int num_workers = 6;
+ Shard(num_workers, worker_threads.workers, total_group_count, kGroupSize,
+ [&gen, data, size](int64 start_group, int64 limit_group) {
+ FillPhiloxRandomTask<
+ Distribution,
+ Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+ start_group,
+ limit_group);
+ });
+ }
+};
+} // namespace functor
+
+// For now, use the same interface as RandomOp, so we can choose either one
+// at run time.
+template <typename Device, class Distribution>
+class PhiloxRandomOp : public OpKernel {
+ public:
+ typedef typename Distribution::ResultElementType T;
+ explicit PhiloxRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsLegacyVector(input.shape()),
+ errors::InvalidArgument("shape must be a vector of {int32,int64}."));
+ Tensor* output = nullptr;
+ if (input.dtype() == DataType::DT_INT32) {
+ auto vec = input.flat<int32>();
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+ vec.data(), vec.size()),
+ &output));
+ } else if (input.dtype() == DataType::DT_INT64) {
+ auto vec = input.flat<int64>();
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+ vec.data(), vec.size()),
+ &output));
+ } else {
+ OP_REQUIRES(ctx, false, errors::InvalidArgument(
+ "shape must be a vector of {int32,int64}."));
+ }
+ functor::FillPhiloxRandom<Device, Distribution>()(
+ ctx, ctx->eigen_device<Device>(),
+ ReserveRandomOutputs(output->flat<T>().size()),
+ output->flat<T>().data(), output->flat<T>().size());
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+
+ // Reserve enough random samples in the generator for the given output count.
+ random::PhiloxRandom ReserveRandomOutputs(int64 output_count) {
+ int64 conservative_sample_count = output_count << 8;
+ return generator_.ReserveSamples128(conservative_sample_count);
+ }
+};
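
ReserveRandomOutputs shifts by 8, i.e. it reserves 256 128-bit Philox samples per requested output, matching the kReservedSamplesPerOutput used by the variable-sample path above. A quick arithmetic check with a hypothetical output count:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t output_count = 1 << 20;  // hypothetical: 1M random outputs
      const int64_t conservative_sample_count = output_count << 8;
      std::cout << output_count << " outputs reserve "
                << conservative_sample_count
                << " 128-bit Philox samples (256 per output)\n";
      return 0;
    }
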
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomUniform") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<CPUDevice, random::UniformDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomStandardNormal") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<CPUDevice, random::NormalDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TruncatedNormal") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp< \
+ CPUDevice, \
+ random::TruncatedNormalDistribution< \
+ random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#if GOOGLE_CUDA
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomUniform") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<GPUDevice, random::UniformDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomStandardNormal") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<GPUDevice, random::NormalDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TruncatedNormal") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp< \
+ GPUDevice, \
+ random::TruncatedNormalDistribution< \
+ random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h
new file mode 100644
index 0000000000..7c7eed4227
--- /dev/null
+++ b/tensorflow/core/kernels/random_op.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_KERNELS_RANDOM_OP_H_
+#define TENSORFLOW_KERNELS_RANDOM_OP_H_
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+template <typename Device, class Distribution>
+struct FillPhiloxRandom;
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RANDOM_OP_H_
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
new file mode 100644
index 0000000000..15cf85f27e
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -0,0 +1,152 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/random_op.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomKernel;
+
+// A CUDA kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a fixed number of samples.
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, false> {
+ typedef typename Distribution::ResultElementType T;
+ PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size) {
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+ int32 offset = thread_id * kGroupSize;
+ gen.Skip(thread_id);
+
+ while (offset < size) {
+ typename Distribution::ResultType samples = dist(&gen);
+
+ for (int i = 0; i < kGroupSize; ++i) {
+ if (offset >= size) {
+ return;
+ }
+ data[offset] = samples[i];
+ ++offset;
+ }
+
+ offset += (total_thread_count - 1) * kGroupSize;
+ gen.Skip(total_thread_count - 1);
+ }
+ }
+};
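
In the fixed-sample kernel each thread starts at thread_id * kGroupSize and, after writing one group, advances by total_thread_count * kGroupSize, so threads interleave groups without overlap. A host-side sketch of the offsets a few threads would cover, under hypothetical launch parameters:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical launch: 8 threads total, groups of 4 outputs each.
      const int32_t total_thread_count = 8;
      const int32_t kGroupSize = 4;
      const int64_t size = 100;

      for (int32_t thread_id = 0; thread_id < 3; ++thread_id) {
        std::cout << "thread " << thread_id << " fills offsets:";
        // Same arithmetic as the kernel: start at thread_id * kGroupSize and
        // advance by total_thread_count * kGroupSize per iteration.
        for (int64_t offset = int64_t{thread_id} * kGroupSize; offset < size;
             offset += int64_t{total_thread_count} * kGroupSize) {
          std::cout << " [" << offset << ", "
                    << std::min<int64_t>(offset + kGroupSize, size) << ")";
        }
        std::cout << "\n";
      }
      return 0;
    }
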
+
+// A CUDA kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a variable number of samples.
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, true> {
+ typedef typename Distribution::ResultElementType T;
+ PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
+ int64 size) {
+ using random::PhiloxRandom;
+ using random::SingleSampleAdapter;
+
+ const int kReservedSamplesPerOutput = 256;
+ const int kGroupSize = Distribution::kResultElementCount;
+ const int kGeneratorSkipPerOutputGroup = kGroupSize *
+ kReservedSamplesPerOutput /
+ PhiloxRandom::kResultElementCount;
+
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+ int64 group_index = thread_id;
+ int64 offset = group_index * kGroupSize;
+ Distribution dist;
+
+ while (offset < size) {
+ // Since each output takes a variable number of samples, we need to
+      // realign the generator to the beginning of the current output group.
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ typename Distribution::ResultType samples = dist(&single_samples);
+
+ for (int i = 0; i < kGroupSize; ++i) {
+ if (offset >= size) {
+ return;
+ }
+ data[offset] = samples[i];
+ ++offset;
+ }
+
+ offset += (total_thread_count - 1) * kGroupSize;
+ group_index += total_thread_count;
+ }
+ }
+};
+
+// A simple launch pad to call the correct function templates to fill the data
+template <class Distribution>
+__global__ void __launch_bounds__(1024)
+ FillPhiloxRandomKernelLaunch(random::PhiloxRandom base_gen,
+ typename Distribution::ResultElementType* data,
+ int64 size) {
+ FillPhiloxRandomKernel<Distribution,
+ Distribution::kVariableSamplesPerOutput>()
+ .Run(base_gen, data, size);
+}
+
+// Partial specialization for GPU
+template <class Distribution>
+struct FillPhiloxRandom<GPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ typedef GPUDevice Device;
+ void operator()(OpKernelContext*, const Device& d, random::PhiloxRandom gen,
+ T* data, int64 size) {
+ const int32 block_size = d.maxCudaThreadsPerBlock();
+ const int32 num_blocks =
+ (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+ block_size;
+
+ FillPhiloxRandomKernelLaunch<
+ Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data,
+ size);
+ }
+};
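
The launch configuration targets full occupancy: block_size is the device's per-block thread limit and num_blocks is the total number of resident threads divided by that limit. A worked example with hypothetical device limits (not values queried from a real GPU):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical device limits.
      const int32_t max_threads_per_block = 1024;
      const int32_t multiprocessor_count = 24;
      const int32_t max_threads_per_multiprocessor = 2048;

      const int32_t block_size = max_threads_per_block;
      const int32_t num_blocks =
          (multiprocessor_count * max_threads_per_multiprocessor) / block_size;

      std::cout << "launch " << num_blocks << " blocks of " << block_size
                << " threads (" << num_blocks * block_size
                << " threads total)\n";  // 48 blocks x 1024 = 49152 threads
      return 0;
    }
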
+
+// Explicit instantiation of the GPU distribution functors
+// clang-format off
+// NVCC cannot handle ">>" properly
+template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, double> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::TruncatedNormalDistribution<
+ random::SingleSampleAdapter<random::PhiloxRandom>, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::TruncatedNormalDistribution<
+ random::SingleSampleAdapter<random::PhiloxRandom>, double> >;
+// clang-format on
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc
new file mode 100644
index 0000000000..751b61cfba
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_test.cc
@@ -0,0 +1,99 @@
+#include <random>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+Tensor Int32(int32 v) {
+ Tensor t(DT_INT32, TensorShape({}));
+ t.scalar<int32>()() = v;
+ return t;
+}
+
+Graph* RandomUniform(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomUniform(g, test::graph::Constant(g, Int32(n)), DT_FLOAT);
+ return g;
+}
+
+Graph* RandomNormal(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomGaussian(g, test::graph::Constant(g, Int32(n)), DT_FLOAT);
+ return g;
+}
+
+Graph* RandomParameters(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomParameters(g, test::graph::Constant(g, Int32(n)),
+ DT_FLOAT);
+ return g;
+}
+
+#define BM_RNG(DEVICE, RNG) \
+ static void BM_##DEVICE##_##RNG(int iters, int arg) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
+ test::Benchmark(#DEVICE, RNG(arg)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);
+
+BM_RNG(cpu, RandomUniform);
+BM_RNG(cpu, RandomNormal);
+BM_RNG(cpu, RandomParameters);
+
+BM_RNG(gpu, RandomUniform);
+BM_RNG(gpu, RandomNormal);
+BM_RNG(gpu, RandomParameters);
+
+static void BM_PhiloxRandom(int iters) {
+ // Fill 2M random numbers
+ int count = 2 << 20;
+
+ testing::ItemsProcessed(static_cast<int64>(iters) * count);
+
+ random::PhiloxRandom gen(0x12345);
+
+ int val = 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < count; j += 4) {
+      // Each invocation of gen() returns a 128-bit sample (four 32-bit values).
+ auto samples = gen();
+
+ // use the result trivially so the compiler does not optimize it away
+ val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
+ }
+ }
+
+  // An anchor point to make sure the compiler does not cut corners
+ CHECK(val) << val;
+}
+BENCHMARK(BM_PhiloxRandom);
+
+static void BM_StdMTRandom(int iters) {
+ // Fill 2M random numbers
+ int count = 2 << 20;
+
+ testing::ItemsProcessed(static_cast<int64>(iters) * count);
+
+ std::mt19937 gen(0x12345);
+
+ int val = 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < count; ++j) {
+      // Each invocation of gen() returns a single 32-bit sample.
+ uint32 sample = gen();
+
+ // use the result trivially so the compiler does not optimize it away
+ val ^= sample;
+ }
+ }
+
+  // An anchor point to make sure the compiler does not cut corners
+ CHECK(val) << val;
+}
+BENCHMARK(BM_StdMTRandom);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_shuffle_op.cc b/tensorflow/core/kernels/random_shuffle_op.cc
new file mode 100644
index 0000000000..b87f4e58a0
--- /dev/null
+++ b/tensorflow/core/kernels/random_shuffle_op.cc
@@ -0,0 +1,89 @@
+// See docs in ../ops/random_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+// TODO(irving): If performance is critical, generate output directly instead
+// of an in-place shuffle using a pseudorandom permutation like
+//
+// https://github.com/otherlab/geode/blob/master/geode/random/permute.cpp
+//
+// This is probably also the right thing if we want a GPU version of shuffling.
+
+// We use our own version of std::random_shuffle to guarantee that exactly
+// size - 1 samples are used.
+template <class Iter, class Random>
+static inline void RandomShuffle(Iter first, Iter last, Random& uniform) {
+ if (first == last) return;
+ const auto stop = last - 1;
+ for (auto i = first; i != stop; ++i) {
+ using std::iter_swap;
+ iter_swap(i, i + uniform(last - i));
+ }
+}
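
RandomShuffle above is a Fisher-Yates shuffle that draws exactly size - 1 uniform samples, one per swap. A standalone usage sketch with a std::mt19937-backed uniform functor standing in for the Philox adapter used by the op:

    #include <cstdint>
    #include <iostream>
    #include <random>
    #include <vector>

    // Same algorithm as RandomShuffle above: one uniform draw per swap.
    template <class Iter, class Random>
    static inline void RandomShuffleSketch(Iter first, Iter last, Random& uniform) {
      if (first == last) return;
      const auto stop = last - 1;
      for (auto i = first; i != stop; ++i) {
        using std::iter_swap;
        iter_swap(i, i + uniform(last - i));
      }
    }

    int main() {
      std::vector<int> v = {0, 1, 2, 3, 4, 5};
      std::mt19937 gen(42);
      int64_t draws = 0;
      // uniform(n) returns a value in [0, n), counting how many draws are made.
      auto uniform = [&gen, &draws](std::ptrdiff_t n) {
        ++draws;
        return std::uniform_int_distribution<std::ptrdiff_t>(0, n - 1)(gen);
      };
      RandomShuffleSketch(v.begin(), v.end(), uniform);
      for (int x : v) std::cout << x << ' ';
      std::cout << "\n(" << draws << " draws for " << v.size() << " elements)\n";
      return 0;
    }
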
+
+template <typename T>
+class RandomShuffleOp : public OpKernel {
+ public:
+ explicit RandomShuffleOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+
+ if (input.NumElements() <= 1 || input.dim_size(0) <= 1) {
+ // No shuffling is required, so copy input directly to output
+ context->set_output(0, input);
+ } else {
+ // Reserve enough random samples for shuffling
+ const int64 size = input.dim_size(0);
+ const int64 samples = size - 1;
+ auto local_gen = generator_.ReserveSamples32(samples);
+ random::SingleSampleAdapter<random::PhiloxRandom> single(&local_gen);
+ const auto uniform = [&single](uint32 n) { return single() % n; };
+
+ if (input.dims() == 1) {
+ // For 1D data, copy and then shuffle in place
+ context->set_output(0, tensor::DeepCopy(input));
+ auto vec = context->mutable_output(0)->vec<T>();
+ RandomShuffle(vec.data(), vec.data() + size, uniform);
+ } else {
+ // For >= 2D, shuffle indices and then copy across
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ const auto input_mat = input.flat_outer_dims<T>();
+ auto output_mat = output->flat_outer_dims<T>();
+ std::vector<int> permutation(size);
+ for (int i = 0; i < size; i++) {
+ permutation[i] = i;
+ }
+ RandomShuffle(permutation.begin(), permutation.end(), uniform);
+ for (int i = 0; i < size; i++) {
+ output_mat.template chip<0>(i) =
+ input_mat.template chip<0>(permutation[i]);
+ }
+ }
+ }
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomShuffle").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ RandomShuffleOp<T>);
+TF_CALL_ALL_TYPES(REGISTER)
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
new file mode 100644
index 0000000000..561ec76e53
--- /dev/null
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -0,0 +1,740 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class RandomShuffleQueue : public QueueBase {
+ public:
+ RandomShuffleQueue(int32 capacity, int32 min_after_dequeue, int64 seed,
+ int64 seed2, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+ Status Initialize(); // Must be called before any other method.
+
+ // Implementations of QueueInterface methods --------------------------------
+
+ Status ValidateTuple(const Tuple& tuple) override;
+ Status ValidateManyTuple(const Tuple& tuple) override;
+ void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override;
+ void TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) override;
+ void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) override;
+ Status MatchesNodeDef(const NodeDef& node_def) override;
+
+ int32 size() override {
+ mutex_lock lock(mu_);
+ return queues_[0].size();
+ }
+
+ private:
+ enum Action { kEnqueue, kDequeue };
+
+ ~RandomShuffleQueue() override {}
+
+ TensorShape ManyOutShape(int i, int batch_size) {
+ TensorShape shape({batch_size});
+ shape.AppendShape(component_shapes_[i]);
+ return shape;
+ }
+
+ // Helper for dequeuing a single random element from queues_.
+ void DequeueLocked(OpKernelContext* ctx, Tuple* tuple)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ void Cancel(Action action, CancellationToken token);
+
+ // Helper for cancelling all pending Enqueue(Many) operations when
+ // Close is called with cancel_pending_enqueues.
+ void CloseAndCancel();
+
+ // Tries to enqueue/dequeue (or close) based on whatever is at the
+ // front of enqueue_attempts_/dequeue_attempts_. Appends to
+  // *clean_up the callback for any finished attempt (so it may be
+ // called once mu_ is released). Returns true if any progress was
+ // made.
+ struct CleanUp {
+ CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm)
+ : finished(f), to_deregister(ct), cm(cm) {}
+ DoneCallback finished;
+ CancellationToken to_deregister;
+ CancellationManager* cm;
+ };
+ bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ // Tries to make progress on the enqueues or dequeues at the front
+ // of the *_attempts_ queues.
+ void FlushUnlocked();
+
+ const int32 capacity_;
+ const int32 min_after_dequeue_;
+ const int64 original_seed_;
+ const int64 original_seed2_;
+
+ mutex mu_;
+ typedef std::vector<PersistentTensor> SubQueue;
+ std::vector<SubQueue> queues_ GUARDED_BY(mu_);
+ bool closed_ GUARDED_BY(mu_);
+ random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+ random::SingleSampleAdapter<random::PhiloxRandom> generator_ GUARDED_BY(mu_);
+
+ enum RunResult { kNoProgress, kProgress, kComplete };
+ struct Attempt;
+ typedef std::function<RunResult(Attempt*)> RunCallback;
+ struct Attempt {
+ int32 elements_requested;
+ DoneCallback done_callback; // must be run outside mu_
+ OpKernelContext* context;
+ CancellationToken cancellation_token;
+ RunCallback run_callback; // must be run while holding mu_
+ bool is_cancelled;
+ Tuple tuple;
+
+ Attempt(int32 elements_requested, DoneCallback done_callback,
+ OpKernelContext* context, CancellationToken cancellation_token,
+ RunCallback run_callback)
+ : elements_requested(elements_requested),
+ done_callback(done_callback),
+ context(context),
+ cancellation_token(cancellation_token),
+ run_callback(run_callback),
+ is_cancelled(false) {}
+ };
+ std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_);
+ std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueue);
+};
+
+RandomShuffleQueue::RandomShuffleQueue(
+ int capacity, int min_after_dequeue, int64 seed, int64 seed2,
+ const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes, const string& name)
+ : QueueBase(component_dtypes, component_shapes, name),
+ capacity_(capacity),
+ min_after_dequeue_(min_after_dequeue),
+ original_seed_(seed),
+ original_seed2_(seed2),
+ closed_(false),
+ generator_(&parent_generator_) {
+ if (seed == 0 && seed2 == 0) {
+ // If both seeds are unspecified, use completely random seeds.
+ seed = random::New64();
+ seed2 = random::New64();
+ }
+ parent_generator_ = random::PhiloxRandom(seed, seed2);
+}
+
+Status RandomShuffleQueue::Initialize() {
+ if (component_dtypes_.empty()) {
+ return errors::InvalidArgument("Empty component types for queue ", name_);
+ }
+ if (!component_shapes_.empty() &&
+ component_dtypes_.size() != component_shapes_.size()) {
+ return errors::InvalidArgument("Different number of component types (",
+ component_dtypes_.size(), ") vs. shapes (",
+ component_shapes_.size(), ").");
+ }
+
+ mutex_lock lock(mu_);
+ queues_.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ queues_.push_back(SubQueue());
+ queues_.back().reserve(min_after_dequeue_);
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status RandomShuffleQueue::ValidateTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (!tuple[i].shape().IsSameSize(component_shapes_[i])) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ component_shapes_[i].ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status RandomShuffleQueue::ValidateManyTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ // Expected shape is [batch_size] + component_shapes_[i]
+ const TensorShape expected_shape = ManyOutShape(i, batch_size);
+ if (!tuple[i].shape().IsSameSize(expected_shape)) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ expected_shape.ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ } else {
+ for (size_t i = 1; i < tuple.size(); ++i) {
+ if (tuple[i].dim_size(0) != batch_size) {
+ return errors::InvalidArgument(
+ "All input tensors must have the same size in the 0th ",
+ "dimension. Component ", i, " has ", tuple[i].dim_size(0),
+ ", and should have ", batch_size);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void RandomShuffleQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) {
+ DCHECK_GT(queues_[0].size(), 0);
+ int64 index = generator_() % queues_[0].size();
+ (*tuple).reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ (*tuple).push_back(*queues_[i][index].AccessTensor(ctx));
+ queues_[i][index] = queues_[i].back();
+ queues_[i].pop_back();
+ }
+}
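
DequeueLocked picks a uniformly random index and removes it in O(1) by overwriting it with the back element and popping the back; element order is not preserved, which is exactly what a shuffling queue wants. A standalone sketch of that removal trick (plain std types, not the PersistentTensor machinery used above):

    #include <iostream>
    #include <random>
    #include <vector>

    // Remove a uniformly random element in O(1): overwrite it with the last
    // element, then shrink the vector by one.
    int RemoveRandom(std::vector<int>* q, std::mt19937* gen) {
      std::uniform_int_distribution<size_t> pick(0, q->size() - 1);
      const size_t index = pick(*gen);
      const int value = (*q)[index];
      (*q)[index] = q->back();
      q->pop_back();
      return value;
    }

    int main() {
      std::vector<int> queue = {10, 20, 30, 40, 50};
      std::mt19937 gen(7);
      while (!queue.empty()) {
        std::cout << RemoveRandom(&queue, &gen) << ' ';
      }
      std::cout << '\n';
      return 0;
    }
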
+
+void RandomShuffleQueue::Cancel(Action action, CancellationToken token) {
+ DoneCallback callback = nullptr;
+ {
+ mutex_lock lock(mu_);
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ for (Attempt& attempt : *attempts) {
+ if (attempt.cancellation_token == token) {
+ attempt.is_cancelled = true;
+ if (action == kEnqueue) {
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ } else {
+ attempt.context->SetStatus(
+ errors::Cancelled("Dequeue operation was cancelled"));
+ }
+ std::swap(callback, attempt.done_callback);
+ break;
+ }
+ }
+ }
+ if (callback) {
+ callback();
+ FlushUnlocked();
+ }
+}
+
+void RandomShuffleQueue::CloseAndCancel() {
+ std::vector<DoneCallback> callbacks;
+ {
+ mutex_lock lock(mu_);
+ closed_ = true;
+ for (Attempt& attempt : enqueue_attempts_) {
+ attempt.is_cancelled = true;
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ callbacks.emplace_back(std::move(attempt.done_callback));
+ }
+ }
+ for (const DoneCallback& callback : callbacks) {
+ callback();
+ }
+ FlushUnlocked();
+}
+
+bool RandomShuffleQueue::TryAttemptLocked(
+ Action action, std::vector<CleanUp>* clean_up) {
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ bool progress = false;
+ bool done = false;
+ while (!done && !attempts->empty()) {
+ if (attempts->front().is_cancelled) {
+ if (action == kEnqueue) {
+ LOG(INFO) << "Skipping cancelled enqueue attempt";
+ } else {
+ LOG(INFO) << "Skipping cancelled dequeue attempt";
+ }
+ attempts->pop_front();
+ } else {
+ Attempt* cur_attempt = &attempts->front();
+ switch (cur_attempt->run_callback(cur_attempt)) {
+ case kNoProgress:
+ done = true;
+ break;
+ case kProgress:
+ done = true;
+ progress = true;
+ break;
+ case kComplete:
+ progress = true;
+ clean_up->emplace_back(std::move(cur_attempt->done_callback),
+ cur_attempt->cancellation_token,
+ cur_attempt->context->cancellation_manager());
+ attempts->pop_front();
+ break;
+ }
+ }
+ }
+ return progress;
+}
+
+void RandomShuffleQueue::FlushUnlocked() {
+ std::vector<CleanUp> clean_up;
+ Ref();
+ {
+ mutex_lock lock(mu_);
+ bool changed;
+ do {
+ changed = TryAttemptLocked(kEnqueue, &clean_up);
+ changed = TryAttemptLocked(kDequeue, &clean_up) || changed;
+ } while (changed);
+ }
+ Unref();
+ for (const auto& to_clean : clean_up) {
+ if (to_clean.to_deregister != CancellationManager::kInvalidToken) {
+ // NOTE(mrry): We can safely ignore the return value of
+ // DeregisterCallback because the mutex mu_ ensures that the
+ // cleanup action only executes once.
+ to_clean.cm->DeregisterCallback(to_clean.to_deregister);
+ }
+ to_clean.finished();
+ }
+}
+
+void RandomShuffleQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ 1, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ if (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ for (int i = 0; i < num_components(); ++i) {
+ queues_[i].push_back(PersistentTensor(tuple[i]));
+ }
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void RandomShuffleQueue::TryEnqueueMany(const Tuple& tuple,
+ OpKernelContext* ctx,
+ DoneCallback callback) {
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (batch_size == 0) {
+ callback();
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ batch_size, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ RunResult result = kNoProgress;
+ while (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ result = kProgress;
+ const int index =
+ tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ TensorShape element_shape(tuple[i].shape());
+ element_shape.RemoveDim(0);
+ PersistentTensor element;
+ Tensor* element_access = nullptr;
+ attempt->context->allocate_persistent(
+ tuple[i].dtype(), element_shape, &element, &element_access);
+ attempt->context->SetStatus(
+ CopySliceToElement(tuple[i], element_access, index));
+ if (!attempt->context->status().ok()) return kComplete;
+ queues_[i].push_back(element);
+ }
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void RandomShuffleQueue::TryDequeue(OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ 1, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s == 0) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "RandomShuffleQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ", 1, ", current size ", s,
+ ")"));
+ return kComplete;
+ }
+ if (!closed_) s -= min_after_dequeue_;
+ if (s > 0) {
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ attempt->done_callback = [callback, tuple]() { callback(tuple); };
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
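
While the queue is open, min_after_dequeue is subtracted before checking whether an element can be handed out, so a dequeue only succeeds if at least min_after_dequeue elements would remain; once the queue is closed, everything left can be drained. A small sketch of that gating rule with hypothetical numbers:

    #include <cstdint>
    #include <iostream>

    // Mirrors the gating logic in TryDequeue: subtract min_after_dequeue while
    // the queue is still open, then require a positive remainder.
    bool CanDequeueOne(int32_t size, int32_t min_after_dequeue, bool closed) {
      int32_t s = size;
      if (!closed) s -= min_after_dequeue;
      return s > 0;
    }

    int main() {
      const int32_t min_after_dequeue = 10;  // hypothetical queue attribute
      std::cout << CanDequeueOne(10, min_after_dequeue, /*closed=*/false)   // 0
                << CanDequeueOne(11, min_after_dequeue, /*closed=*/false)   // 1
                << CanDequeueOne(3, min_after_dequeue, /*closed=*/true)     // 1
                << "\n";
      return 0;
    }
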
+
+void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ if (!specified_shapes()) {
+ ctx->SetStatus(
+ errors::InvalidArgument("RandomShuffleQueue's DequeueMany requires the "
+ "components to have specified shapes."));
+ callback(Tuple());
+ return;
+ }
+ if (num_elements == 0) {
+ Tuple tuple;
+ tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ // TODO(josh11b,misard): Switch to allocate_output(). Problem is
+ // this breaks the abstraction boundary since we don't *really*
+ // know if and how the Tensors in the tuple we pass to callback
+ // correspond to the outputs of *ctx. For example, the
+ // ReaderRead Op uses TryDequeue() to get a filename out of a
+ // queue that is used internally by the reader and is not
+ // associated with any output of the ReaderRead.
+ // mrry@ adds:
+ // Maybe we need to pass a std::function<Tensor*(...)> (or
+ // better signature) that calls the appropriate allocator
+ // function in addition to ctx? (Or support a shim Allocator
+ // that has an internal OpKernelContext*, and dispatches to the
+ // appropriate method?)
+ // misard@ adds:
+ // I don't see that a std::function would help. The problem is
+ // that at this point (allocation time) the system doesn't know
+ // what is going to happen to the element read out of the
+ // queue. As long as we keep the generality that TensorFlow Ops
+ // do their own dynamic allocation in arbitrary C++ code, we
+ // need to preserve robustness to allocating output Tensors with
+ // the 'wrong' attributes, and fixing up with a copy. The only
+ // improvement I can see here in the future would be to support
+ // an optimized case where the queue 'knows' what attributes to
+ // use, and plumbs them through here.
+ Tensor element;
+ ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element);
+ tuple.emplace_back(element);
+ }
+ callback(tuple);
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ num_elements, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s < attempt->elements_requested) {
+ attempt->context->SetStatus(errors::OutOfRange(
+                  "RandomShuffleQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ",
+ attempt->elements_requested, ", current size ", s, ")"));
+ return kComplete;
+ }
+
+ RunResult result = kNoProgress;
+ if (!closed_) s -= min_after_dequeue_;
+ for (; s > 0; --s) {
+ if (attempt->tuple.empty()) {
+ // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+ // blocked dequeue attempts waiting.
+ attempt->tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ const TensorShape shape =
+ ManyOutShape(i, attempt->elements_requested);
+ Tensor element;
+ attempt->context->allocate_temp(component_dtypes_[i], shape,
+ &element);
+ attempt->tuple.emplace_back(element);
+ }
+ }
+ result = kProgress;
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ const int index =
+ attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ attempt->context->SetStatus(
+ CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+ if (!attempt->context->status().ok()) return kComplete;
+ }
+ tuple.clear();
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ tuple = attempt->tuple;
+ attempt->done_callback = [callback, tuple]() {
+ callback(tuple);
+ };
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void RandomShuffleQueue::Close(OpKernelContext* ctx,
+ bool cancel_pending_enqueues,
+ DoneCallback callback) {
+ if (cancel_pending_enqueues) {
+ CloseAndCancel();
+ callback();
+ } else {
+ {
+ mutex_lock lock(mu_);
+ enqueue_attempts_.emplace_back(
+ 0, callback, ctx, CancellationManager::kInvalidToken,
+ [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is already closed."));
+ } else {
+ closed_ = true;
+ }
+ return kComplete;
+ });
+ }
+ FlushUnlocked();
+ }
+}
+
+Status RandomShuffleQueue::MatchesNodeDef(const NodeDef& node_def) {
+ TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "RandomShuffleQueue"));
+ TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_));
+
+ int32 min_after_dequeue = -1;
+ TF_RETURN_IF_ERROR(
+ GetNodeAttr(node_def, "min_after_dequeue", &min_after_dequeue));
+ if (min_after_dequeue != min_after_dequeue_) {
+ return errors::InvalidArgument(
+ "Shared queue '", name_, "' has min_after_dequeue ",
+ min_after_dequeue_, " but requested min_after_dequeue was ",
+ min_after_dequeue, ".");
+ }
+
+ int64 seed = -1;
+ int64 seed2 = -1;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed", &seed));
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed2", &seed2));
+ if ((seed != 0 || seed2 != 0) &&
+ (seed != original_seed_ || seed2 != original_seed2_)) {
+ return errors::InvalidArgument(
+ "Shared queue '", name_, "' has random seeds (", original_seed_, ", ",
+ original_seed2_, ") but requested seeds are (", seed, ", ", seed2,
+ ").");
+ }
+
+ TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def));
+ TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def));
+
+ return Status::OK();
+}
+
+typedef std::shared_ptr<QueueInterface> QueueInterfacePtr;
+
+// Defines a RandomShuffleQueueOp, which produces a Queue (specifically, one
+// backed by RandomShuffleQueue) that persists across different graph
+// executions and sessions. Running this op produces a single-element
+// tensor of handles to Queues in the corresponding device.
+class RandomShuffleQueueOp : public OpKernel {
+ public:
+ explicit RandomShuffleQueueOp(OpKernelConstruction* context)
+ : OpKernel(context), queue_handle_set_(false) {
+ OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_));
+ OP_REQUIRES_OK(context,
+ context->allocate_persistent(DT_STRING, TensorShape({2}),
+ &queue_handle_, nullptr));
+ if (capacity_ < 0) {
+ capacity_ = RandomShuffleQueue::kUnbounded;
+ }
+ OP_REQUIRES_OK(context,
+ context->GetAttr("min_after_dequeue", &min_after_dequeue_));
+ OP_REQUIRES(context, min_after_dequeue_ >= 0,
+ errors::InvalidArgument("min_after_dequeue ",
+ min_after_dequeue_, " must be >= 0"));
+ OP_REQUIRES(
+ context, min_after_dequeue_ < capacity_,
+ errors::InvalidArgument("min_after_dequeue ", min_after_dequeue_,
+ " must be < capacity ", capacity_));
+ OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+ OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
+
+ OP_REQUIRES_OK(context,
+ context->GetAttr("component_types", &component_types_));
+ OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_));
+ }
+
+ ~RandomShuffleQueueOp() override {
+ // If the queue object was not shared, delete it.
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!queue_handle_set_) {
+ OP_REQUIRES_OK(ctx, SetQueueHandle(ctx));
+ }
+ ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx));
+ }
+
+ private:
+ Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def()));
+ QueueInterface* queue;
+ auto creator = [this](QueueInterface** ret) {
+ auto* q = new RandomShuffleQueue(capacity_, min_after_dequeue_, seed_,
+ seed2_, component_types_,
+ component_shapes_, cinfo_.name());
+ Status s = q->Initialize();
+ if (s.ok()) {
+ *ret = q;
+ } else {
+ q->Unref();
+ }
+ return s;
+ };
+ TF_RETURN_IF_ERROR(
+ cinfo_.resource_manager()->LookupOrCreate<QueueInterface>(
+ cinfo_.container(), cinfo_.name(), &queue, creator));
+ core::ScopedUnref unref_me(queue);
+ // Verify that the shared queue is compatible with the requested arguments.
+ TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def()));
+ auto h = queue_handle_.AccessTensor(ctx)->flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ queue_handle_set_ = true;
+ return Status::OK();
+ }
+
+ int32 capacity_;
+ int32 min_after_dequeue_;
+ int64 seed_;
+ int64 seed2_;
+ DataTypeVector component_types_;
+ std::vector<TensorShape> component_shapes_;
+ ContainerInfo cinfo_;
+
+ mutex mu_;
+ PersistentTensor queue_handle_ GUARDED_BY(mu_);
+ bool queue_handle_set_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("RandomShuffleQueue").Device(DEVICE_CPU),
+ RandomShuffleQueueOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
new file mode 100644
index 0000000000..a3f4e0b0cb
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -0,0 +1,305 @@
+#include "tensorflow/core/kernels/range_sampler.h"
+
+#include <vector>
+#include <unordered_set>
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+using gtl::ArraySlice;
+using gtl::MutableArraySlice;
+
+RangeSampler::~RangeSampler() {}
+
+void RangeSampler::SampleBatch(random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch) const {
+ SampleBatchGetExpectedCount(
+ rnd, unique, batch, gtl::MutableArraySlice<float>(),
+ gtl::ArraySlice<int64>(), gtl::MutableArraySlice<float>());
+}
+
+void RangeSampler::SampleBatchGetExpectedCount(
+ random::SimplePhilox* rnd, bool unique, gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count) const {
+ SampleBatchGetExpectedCountAvoid(rnd, unique, batch, batch_expected_count,
+ extras, extras_expected_count,
+ gtl::ArraySlice<int64>());
+}
+
+namespace {
+
+// Approximates the expected count of a value in the output of SampleBatch.
+//
+// If unique=false, then this is (Probability(value) * batch_size)
+//
+// We use batch_size and num_tries, where num_tries is the observed number of
+// tries it took to get batch_size unique values.
+//
+// Assuming (falsely) that the number of tries to get a batch of batch_size
+// distinct values is _always_ num_tries, the probability that the value
+// is in a batch is (1 - (1-p)^num_tries)
+static float ExpectedCountHelper(float p, int batch_size, int num_tries) {
+ if (num_tries == batch_size) {
+ // This shortcut will always be taken if unique=false
+ return p * batch_size;
+ }
+ // numerically stable version of (1 - (1-p)^num_tries)
+ return -expm1(num_tries * log1p(-p));
+}
+
+} // namespace
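
The -expm1(num_tries * log1p(-p)) expression is algebraically identical to 1 - (1 - p)^num_tries but avoids the catastrophic cancellation the naive form suffers for very small p. A quick numerical comparison of the two forms:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double p = 1e-15;      // probability of a very rare value
      const int num_tries = 1000;  // observed tries for the batch

      // Naive form: the final subtraction cancels most significant digits.
      const double naive = 1.0 - std::pow(1.0 - p, num_tries);
      // Stable form, as used by ExpectedCountHelper above.
      const double stable = -std::expm1(num_tries * std::log1p(-p));

      // Both approximate num_tries * p = 1e-12, but the stable form keeps
      // full double precision while the naive one retains only a few digits.
      std::printf("naive  = %.17g\nstable = %.17g\n", naive, stable);
      return 0;
    }
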
+
+void RangeSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ const int batch_size = batch.size();
+ int num_tries;
+
+ if (unique) {
+ CHECK_LE(batch_size + avoided_values.size(), range_);
+ std::unordered_set<int64> used(batch_size);
+ used.insert(avoided_values.begin(), avoided_values.end());
+ int num_picked = 0;
+ num_tries = 0;
+ while (num_picked < batch_size) {
+ num_tries++;
+ CHECK_LT(num_tries, kint32max);
+ int64 value = Sample(rnd);
+ if (gtl::InsertIfNotPresent(&used, value)) {
+ batch[num_picked++] = value;
+ }
+ }
+ } else {
+ CHECK_EQ(avoided_values.size(), 0)
+ << "avoided_values only supported with unique=true";
+ for (int i = 0; i < batch_size; i++) {
+ batch[i] = Sample(rnd);
+ }
+ num_tries = batch_size;
+ }
+ // Compute the expected counts of the batch and the extra values
+ if (batch_expected_count.size() > 0) {
+ CHECK_EQ(batch_size, batch_expected_count.size());
+ for (int i = 0; i < batch_size; i++) {
+ batch_expected_count[i] =
+ ExpectedCountHelper(Probability(batch[i]), batch_size, num_tries);
+ }
+ }
+ CHECK_EQ(extras.size(), extras_expected_count.size());
+ for (size_t i = 0; i < extras.size(); i++) {
+ extras_expected_count[i] =
+ ExpectedCountHelper(Probability(extras[i]), batch_size, num_tries);
+ }
+}
+
+AllSampler::AllSampler(int64 range)
+ : RangeSampler(range), inv_range_(1.0 / range) {}
+
+void AllSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ const int batch_size = batch.size();
+ CHECK_EQ(range_, batch_size);
+ for (int i = 0; i < batch_size; i++) {
+ batch[i] = i;
+ }
+ if (batch_expected_count.size() > 0) {
+ CHECK_EQ(batch_size, batch_expected_count.size());
+ for (int i = 0; i < batch_size; i++) {
+ batch_expected_count[i] = 1;
+ }
+ }
+ CHECK_EQ(0, avoided_values.size());
+ CHECK_EQ(extras.size(), extras_expected_count.size());
+ for (size_t i = 0; i < extras.size(); i++) {
+ extras_expected_count[i] = 1;
+ }
+}
+
+UniformSampler::UniformSampler(int64 range)
+ : RangeSampler(range), inv_range_(1.0 / range) {}
+
+int64 UniformSampler::Sample(random::SimplePhilox* rnd) const {
+ return rnd->Uniform64(range_);
+}
+
+float UniformSampler::Probability(int64 value) const { return inv_range_; }
+
+LogUniformSampler::LogUniformSampler(int64 range)
+ : RangeSampler(range), log_range_(log(range + 1)) {}
+
+int64 LogUniformSampler::Sample(random::SimplePhilox* rnd) const {
+ const int64 value =
+ static_cast<int64>(exp(rnd->RandDouble() * log_range_)) - 1;
+ CHECK_GE(value, 0);
+ // Mathematically, value should be <= range_, but might not be due to some
+ // floating point roundoff, so we mod by range_.
+ return value % range_;
+}
+
+float LogUniformSampler::Probability(int64 value) const {
+  // value is returned iff rnd->RandDouble() * log_range_ in the
+  // Sample() function falls between log(value + 1)
+ // and log(value + 2). The probability of this is:
+ // (log(value + 2) - log(value + 1)) / log_range
+ // To avoid two calls to log(), we compute this as follows:
+ return (log((value + 2.0) / (value + 1.0))) / log_range_;
+}
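
These per-value probabilities telescope: summing log((v + 2) / (v + 1)) over v = 0 .. range - 1 gives log(range + 1), which is exactly log_range_, so the distribution sums to one. A quick numerical check of that identity for a hypothetical range:

    #include <cmath>
    #include <cstdio>

    int main() {
      const long long range = 1000;  // hypothetical vocabulary size
      const double log_range = std::log(static_cast<double>(range) + 1.0);

      double sum = 0.0;
      for (long long value = 0; value < range; ++value) {
        // Same expression as LogUniformSampler::Probability above.
        sum += std::log((value + 2.0) / (value + 1.0)) / log_range;
      }
      // The sum telescopes to log(range + 1) / log(range + 1) == 1.
      std::printf("sum of probabilities = %.12f\n", sum);
      return 0;
    }
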
+
+ThreadUnsafeUnigramSampler::ThreadUnsafeUnigramSampler(int64 range)
+ : RangeSampler(range), picker_(range) {
+ CHECK_LT(range, kint32max);
+}
+
+int64 ThreadUnsafeUnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ return picker_.Pick(rnd);
+}
+
+float ThreadUnsafeUnigramSampler::Probability(int64 value) const {
+ return static_cast<float>(picker_.get_weight(value)) / picker_.total_weight();
+}
+
+void ThreadUnsafeUnigramSampler::Update(ArraySlice<int64> values) {
+ int num_updates = std::min(static_cast<int>(values.size()),
+ kint32max - picker_.total_weight());
+ for (int i = 0; i < num_updates; i++) {
+ const int64 value = values[i];
+ picker_.set_weight(value, picker_.get_weight(value) + 1);
+ }
+}
+
+// Thread-safe unigram sampler
+UnigramSampler::UnigramSampler(int64 range)
+ : RangeSampler(range), unsafe_sampler_(range) {
+ CHECK_LT(range, kint32max);
+}
+
+int64 UnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ mutex_lock lock(mu_); // could use reader lock
+ return unsafe_sampler_.Sample(rnd);
+}
+
+float UnigramSampler::Probability(int64 value) const {
+ mutex_lock lock(mu_); // could use reader lock
+ return unsafe_sampler_.Probability(value);
+}
+
+// Overriding at a high level results in far fewer lock acquisitions.
+void UnigramSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ mutex_lock lock(mu_); // could use reader lock
+ unsafe_sampler_.SampleBatchGetExpectedCountAvoid(
+ rnd, unique, batch, batch_expected_count, extras, extras_expected_count,
+ avoided_values);
+}
+
+void UnigramSampler::Update(ArraySlice<int64> values) {
+ mutex_lock lock(mu_);
+ unsafe_sampler_.Update(values);
+}
+
+FixedUnigramSampler::FixedUnigramSampler(Env* env, int64 range,
+ const string& vocab_file,
+ float distortion,
+ int32 num_reserved_ids,
+ int32 num_shards, int32 shard)
+ : RangeSampler(range),
+ total_weight_(0.0),
+ num_shards_(num_shards),
+ shard_(shard) {
+ FillReservedIds(num_reserved_ids);
+ // TODO(vanhoucke): make this non-crashing.
+ TF_CHECK_OK(LoadFromFile(env, vocab_file, distortion));
+ CHECK_EQ(range, weights_.size());
+ dist_sampler_.reset(new random::DistributionSampler(weights_));
+}
+
+FixedUnigramSampler::FixedUnigramSampler(int64 range,
+ const std::vector<float>& unigrams,
+ float distortion,
+ int32 num_reserved_ids,
+ int32 num_shards, int32 shard)
+ : RangeSampler(range),
+ total_weight_(0.0),
+ num_shards_(num_shards),
+ shard_(shard) {
+ FillReservedIds(num_reserved_ids);
+ LoadFromUnigrams(unigrams, distortion);
+ // TODO(vanhoucke): make this non-crashing.
+ CHECK_EQ(range, weights_.size());
+ dist_sampler_.reset(new random::DistributionSampler(weights_));
+}
+
+float FixedUnigramSampler::Probability(int64 value) const {
+ return weights_.at(value) / total_weight_;
+}
+
+int64 FixedUnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ return dist_sampler_->Sample(rnd);
+}
+
+void FixedUnigramSampler::FillReservedIds(int32 num_reserved_ids) {
+ for (int32 word_id = 0; word_id < num_reserved_ids; ++word_id) {
+ if (word_id % num_shards_ == shard_) weights_.push_back(0.0);
+ }
+}
+
+Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file,
+ float distortion) {
+ RandomAccessFile* file;
+ TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file));
+ io::InputBuffer in(file, 262144 /*bytes*/);
+ string line;
+ int32 word_id = weights_.size();
+ while (in.ReadLine(&line).ok()) {
+    // The vocabulary file should be in CSV-like format, with the last
+    // field being the weight associated with the word.
+ std::vector<string> cols = str_util::Split(line, ',');
+ if (cols.size() == 0) continue;
+ // Skip entries that do not belong to this shard.
+ if (word_id % num_shards_ == shard_) {
+ float w = 0.0;
+ if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) {
+ return errors::InvalidArgument("Wrong vocabulary format at line: ",
+ line);
+ }
+ w = pow(w, distortion);
+ total_weight_ += w;
+ weights_.push_back(w);
+ }
+ ++word_id;
+ }
+ return Status::OK();
+}
+
+void FixedUnigramSampler::LoadFromUnigrams(const std::vector<float>& unigrams,
+ float distortion) {
+ int32 word_id = weights_.size();
+ for (float w : unigrams) {
+ // Skip entries that do not belong to this shard.
+ if (word_id % num_shards_ == shard_) {
+ w = pow(w, distortion);
+ total_weight_ += w;
+ weights_.push_back(w);
+ }
+ ++word_id;
+ }
+}
+
+} // namespace tensorflow
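
The distortion exponent interpolates between the raw unigram weights (distortion = 1) and a uniform distribution (distortion = 0). A small sketch of how pow(w, distortion) reshapes a hypothetical weight vector before normalization:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<float> unigrams = {100.0f, 10.0f, 1.0f};  // hypothetical counts

      for (float distortion : {1.0f, 0.5f, 0.0f}) {
        double total = 0.0;
        std::vector<double> weights;
        for (float w : unigrams) {
          const double distorted = std::pow(w, distortion);
          weights.push_back(distorted);
          total += distorted;
        }
        std::printf("distortion %.1f:", distortion);
        for (double w : weights) std::printf(" %.3f", w / total);  // probabilities
        std::printf("\n");
      }
      return 0;
    }
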
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
new file mode 100644
index 0000000000..18364c2c03
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -0,0 +1,237 @@
+#ifndef TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+#define TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+
+#include <vector>
+
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/lib/random/weighted_picker.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+class Env;
+
+// Abstract subclass for sampling from the set of non-negative integers
+// [0, range)
+class RangeSampler {
+ public:
+ explicit RangeSampler(int range) : range_(range) { CHECK_GT(range_, 0); }
+ virtual ~RangeSampler();
+
+ // Sample a single value
+ virtual int64 Sample(random::SimplePhilox* rnd) const = 0;
+
+ // The probability that a single call to Sample() returns the given value.
+ // Assumes that value is in [0, range). No range checking is done.
+ virtual float Probability(int64 value) const = 0;
+
+ // Fill "batch" with samples from the distribution.
+ // If unique=true, then we re-pick each element until we get a
+ // value distinct from all previously picked values in the batch.
+ void SampleBatch(random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch) const;
+
+ // Fill "batch" with samples from the distribution, and report
+ // "expected counts".
+ //
+ // The "expected count" of a value is an estimate of the expected
+ // number of occurrences of the value in the batch returned by a
+ // call to this function with the given parameters. If unique=true,
+ // the expected count is an inclusion probability. For details on
+ // this estimation, see the comment to "ExpectedCountHelper" in the
+ // .cc file.
+ //
+ // Expected counts for the elements of the returned "batch" are reported
+ // in the aligned array "batch_expected_count".
+ //
+  // The user can optionally provide "extras", containing values in the range.
+ // The expected counts for the extras are reported in the aligned array
+ // "extras_expected_count".
+ //
+ // "batch_expected_count" must have size equal to 0 or to the size of "batch".
+ // "extras" and "extras_expected_count" must have equal size.
+ void SampleBatchGetExpectedCount(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count) const;
+
+ // Same as SampleBatchGetExpectedCount (see above), but with avoided values.
+ // We repick to avoid all of the values in "avoided_values".
+ // "avoided_values" is only supported with unique=true. If
+ // unique=false, then avoided_values must be empty.
+ virtual void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const;
+
+  // Does this sampler need to be updated with values (e.g., UnigramSampler)?
+ virtual bool NeedsUpdates() const { return false; }
+
+ // Updates the underlying distribution
+ virtual void Update(gtl::ArraySlice<int64> values) {
+ LOG(FATAL) << "Update not supported for this sampler type.";
+ }
+
+ int64 range() { return range_; }
+
+ protected:
+ const int64 range_;
+};
+
+// An AllSampler only samples batches of size equal to range.
+// It returns the entire range.
+// It cannot sample single values.
+class AllSampler : public RangeSampler {
+ public:
+ explicit AllSampler(int64 range);
+
+ ~AllSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override {
+ LOG(FATAL) << "Should not be called";
+ }
+
+ float Probability(int64 value) const override {
+ LOG(FATAL) << "Should not be called";
+ }
+
+ void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const override;
+
+ private:
+ const float inv_range_;
+};
+
+class UniformSampler : public RangeSampler {
+ public:
+ explicit UniformSampler(int64 range);
+
+ ~UniformSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ private:
+ const float inv_range_;
+};
+
+class LogUniformSampler : public RangeSampler {
+ public:
+ explicit LogUniformSampler(int64 range);
+
+ ~LogUniformSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ private:
+ const double log_range_;
+};
+
+// Thread-unsafe unigram sampler
+class ThreadUnsafeUnigramSampler : public RangeSampler {
+ public:
+ explicit ThreadUnsafeUnigramSampler(int64 range);
+ ~ThreadUnsafeUnigramSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ bool NeedsUpdates() const override { return true; }
+ void Update(gtl::ArraySlice<int64> values) override;
+
+ private:
+ random::WeightedPicker picker_;
+};
+
+// Thread-safe unigram sampler
+class UnigramSampler : public RangeSampler {
+ public:
+ explicit UnigramSampler(int64 range);
+ ~UnigramSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+  // Overriding at a high level results in far fewer lock acquisitions.
+ void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const override;
+
+ bool NeedsUpdates() const override { return true; }
+ void Update(gtl::ArraySlice<int64> values) override;
+
+ private:
+ ThreadUnsafeUnigramSampler unsafe_sampler_ GUARDED_BY(mu_);
+ mutable mutex mu_;
+};
+
+// A unigram sampler that uses a fixed unigram distribution read from a
+// file or passed in as an in-memory array instead of building up the
+// distribution from data on the fly. There is also an option to skew the
+// distribution by applying a distortion power to the weights.
+class FixedUnigramSampler : public RangeSampler {
+ public:
+ // The vocab_file is assumed to be a CSV, with the last entry of each row a
+ // value representing the count or probability for the corresponding ID.
+ FixedUnigramSampler(Env* env, int64 range, const string& vocab_file,
+ float distortion, int32 num_reserved_ids,
+ int32 num_shards, int32 shard);
+
+ FixedUnigramSampler(int64 range, const std::vector<float>& unigrams,
+ float distortion, int32 num_reserved_ids,
+ int32 num_shards, int32 shard);
+
+ float Probability(int64 value) const override;
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ private:
+ // Underlying distribution sampler.
+ std::unique_ptr<random::DistributionSampler> dist_sampler_;
+ // Weights for individual samples. The probability of a sample i is defined
+ // as weights_.at(i) / total_weight_.
+ std::vector<float> weights_;
+ // The total weight of all samples.
+ float total_weight_;
+ // Sharding information of the sampler. The whole vocabulary is sharded
+ // into num_shards_ smaller ranges and each sampler is responsible for one
+ // such smaller range, identified by the shard number.
+ int32 num_shards_;
+ int32 shard_;
+
+ // Fill the sampler with the appropriate number of reserved IDs.
+ void FillReservedIds(int32 num_reserved_ids);
+ // Load IDs to sample from a CSV file. It is assumed that the last item of
+ // each row contains a count or probability for the corresponding ID.
+ Status LoadFromFile(Env* env, const string& vocab_file, float distortion);
+ // Load from an in-memory array.
+ void LoadFromUnigrams(const std::vector<float>& unigrams, float distortion);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
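For context, a minimal usage sketch of the sampler interface declared above (not part of the change itself; it only relies on the SampleBatch and Probability methods that the tests below also exercise): draw five distinct values from [0, 100) under a log-uniform distribution and report the probability assigned to each.

    #include <vector>
    #include "tensorflow/core/kernels/range_sampler.h"
    #include "tensorflow/core/lib/random/simple_philox.h"
    #include "tensorflow/core/platform/logging.h"

    void SketchSampleBatch() {
      tensorflow::LogUniformSampler sampler(100);
      tensorflow::random::PhiloxRandom philox(301, 17);
      tensorflow::random::SimplePhilox rnd(&philox);
      std::vector<tensorflow::int64> batch(5);
      sampler.SampleBatch(&rnd, true /* unique */, &batch);
      for (tensorflow::int64 value : batch) {
        LOG(INFO) << value << " sampled with probability "
                  << sampler.Probability(value);
      }
    }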
diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc
new file mode 100644
index 0000000000..72c39009e4
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler_test.cc
@@ -0,0 +1,320 @@
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/kernels/range_sampler.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+namespace {
+
+using gtl::ArraySlice;
+using gtl::MutableArraySlice;
+
+class RangeSamplerTest : public ::testing::Test {
+ protected:
+ void CheckProbabilitiesSumToOne() {
+ double sum = 0;
+ for (int i = 0; i < sampler_->range(); i++) {
+ sum += sampler_->Probability(i);
+ }
+ EXPECT_NEAR(sum, 1.0, 1e-4);
+ }
+ void CheckHistogram(int num_samples, float tolerance) {
+ const int range = sampler_->range();
+ std::vector<int> h(range);
+ std::vector<int64> a(num_samples);
+ // Using a fixed random seed to make the test deterministic.
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ sampler_->SampleBatch(&rnd, false, &a);
+ for (int i = 0; i < num_samples; i++) {
+ int64 val = a[i];
+ ASSERT_GE(val, 0);
+ ASSERT_LT(val, range);
+ h[val]++;
+ }
+ for (int val = 0; val < range; val++) {
+ EXPECT_NEAR((h[val] + 0.0) / num_samples, sampler_->Probability(val),
+ tolerance);
+ }
+ }
+ void Update1() {
+ // Add the value 3 ten times.
+ std::vector<int64> a(10);
+ for (int i = 0; i < 10; i++) {
+ a[i] = 3;
+ }
+ sampler_->Update(a);
+ }
+ void Update2() {
+ // Add each value n exactly n times (for n in [1, 9]).
+ int64 a[10];
+ for (int i = 0; i < 10; i++) {
+ a[i] = i;
+ }
+ for (int64 i = 1; i < 10; i++) {
+ sampler_->Update(ArraySlice<int64>(a + i, 10 - i));
+ }
+ }
+ std::unique_ptr<RangeSampler> sampler_;
+};
+
+TEST_F(RangeSamplerTest, UniformProbabilities) {
+ sampler_.reset(new UniformSampler(10));
+ for (int i = 0; i < 10; i++) {
+ CHECK_EQ(sampler_->Probability(i), sampler_->Probability(0));
+ }
+}
+
+TEST_F(RangeSamplerTest, UniformChecksum) {
+ sampler_.reset(new UniformSampler(10));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, UniformHistogram) {
+ sampler_.reset(new UniformSampler(10));
+ CheckHistogram(1000, 0.05);
+}
+
+TEST_F(RangeSamplerTest, LogUniformProbabilities) {
+ int range = 1000000;
+ sampler_.reset(new LogUniformSampler(range));
+ for (int i = 100; i < range; i *= 2) {
+ float ratio = sampler_->Probability(i) / sampler_->Probability(i / 2);
+ EXPECT_NEAR(ratio, 0.5, 0.1);
+ }
+}
+
+TEST_F(RangeSamplerTest, LogUniformChecksum) {
+ sampler_.reset(new LogUniformSampler(10));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, LogUniformHistogram) {
+ sampler_.reset(new LogUniformSampler(10));
+ CheckHistogram(1000, 0.05);
+}
+
+TEST_F(RangeSamplerTest, UnigramProbabilities1) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ EXPECT_NEAR(sampler_->Probability(3), 0.55, 1e-4);
+ for (int i = 0; i < 10; i++) {
+ if (i != 3) {
+ ASSERT_NEAR(sampler_->Probability(i), 0.05, 1e-4);
+ }
+ }
+}
+TEST_F(RangeSamplerTest, UnigramProbabilities2) {
+ sampler_.reset(new UnigramSampler(10));
+ Update2();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), (i + 1) / 55.0, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, UnigramChecksum) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, UnigramHistogram) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ CheckHistogram(1000, 0.05);
+}
+
+static const char kVocabContent[] =
+ "w1,1\n"
+ "w2,2\n"
+ "w3,4\n"
+ "w4,8\n"
+ "w5,16\n"
+ "w6,32\n"
+ "w7,64\n"
+ "w8,128\n"
+ "w9,256";
+TEST_F(RangeSamplerTest, FixedUnigramProbabilities) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 0; i < 9; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramChecksum) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, FixedUnigramHistogram) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ CheckHistogram(1000, 0.05);
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 10, fname, 0.8, 1, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 1; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 11, fname, 0.8, 2, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 2; i < 11; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 0; i < 9; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramChecksumFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ CheckProbabilitiesSumToOne();
+}
+TEST_F(RangeSamplerTest, FixedUnigramHistogramFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ CheckHistogram(1000, 0.05);
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1FromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(10, weights, 0.8, 1, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 1; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2FromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(11, weights, 0.8, 2, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 2; i < 11; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4);
+ }
+}
+
+// Sample() and Probability() must not be called directly on an AllSampler.
+// We test SampleBatchGetExpectedCount() instead.
+TEST_F(RangeSamplerTest, All) {
+ int batch_size = 10;
+ sampler_.reset(new AllSampler(10));
+ std::vector<int64> batch(batch_size);
+ std::vector<float> batch_expected(batch_size);
+ std::vector<int64> extras(2);
+ std::vector<float> extras_expected(2);
+ extras[0] = 0;
+ extras[1] = batch_size - 1;
+ sampler_->SampleBatchGetExpectedCount(nullptr, // no random numbers needed
+ false, &batch, &batch_expected, extras,
+ &extras_expected);
+ for (int i = 0; i < batch_size; i++) {
+ EXPECT_EQ(i, batch[i]);
+ EXPECT_EQ(1, batch_expected[i]);
+ }
+ EXPECT_EQ(1, extras_expected[0]);
+ EXPECT_EQ(1, extras_expected[1]);
+}
+
+TEST_F(RangeSamplerTest, Unique) {
+ // We sample num_batches batches, each without replacement.
+ //
+ // We check that the returned expected counts roughly agree with each other
+ // and with the average observed frequencies over the set of batches.
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ const int range = 100;
+ const int batch_size = 50;
+ const int num_batches = 100;
+ sampler_.reset(new LogUniformSampler(range));
+ std::vector<int> histogram(range);
+ std::vector<int64> batch(batch_size);
+ std::vector<int64> all_values(range);
+ for (int i = 0; i < range; i++) {
+ all_values[i] = i;
+ }
+ std::vector<float> expected(range);
+
+ // Sample one batch and get the expected counts of all values
+ sampler_->SampleBatchGetExpectedCount(
+ &rnd, true, &batch, MutableArraySlice<float>(), all_values, &expected);
+ // Check that all elements are unique
+ std::set<int64> s(batch.begin(), batch.end());
+ CHECK_EQ(batch_size, s.size());
+
+ for (int trial = 0; trial < num_batches; trial++) {
+ std::vector<float> trial_expected(range);
+ sampler_->SampleBatchGetExpectedCount(&rnd, true, &batch,
+ MutableArraySlice<float>(),
+ all_values, &trial_expected);
+ for (int i = 0; i < range; i++) {
+ EXPECT_NEAR(expected[i], trial_expected[i], expected[i] * 0.5);
+ }
+ for (int i = 0; i < batch_size; i++) {
+ histogram[batch[i]]++;
+ }
+ }
+ for (int i = 0; i < range; i++) {
+ // Check that the computed expected count agrees with the average observed
+ // count.
+ const float average_count = static_cast<float>(histogram[i]) / num_batches;
+ EXPECT_NEAR(expected[i], average_count, 0.2);
+ }
+}
+
+TEST_F(RangeSamplerTest, Avoid) {
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ sampler_.reset(new LogUniformSampler(100));
+ std::vector<int64> avoided(2);
+ avoided[0] = 17;
+ avoided[1] = 23;
+ std::vector<int64> batch(98);
+
+ // We expect to pick all elements of [0, 100) except the avoided two.
+ sampler_->SampleBatchGetExpectedCountAvoid(
+ &rnd, true, &batch, MutableArraySlice<float>(), ArraySlice<int64>(),
+ MutableArraySlice<float>(), avoided);
+
+ int sum = 0;
+ for (auto val : batch) {
+ sum += val;
+ }
+ const int expected_sum = 100 * 99 / 2 - avoided[0] - avoided[1];
+ EXPECT_EQ(expected_sum, sum);
+}
+
+} // namespace
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reader_base.cc b/tensorflow/core/kernels/reader_base.cc
new file mode 100644
index 0000000000..06211efb38
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.cc
@@ -0,0 +1,156 @@
+#include "tensorflow/core/kernels/reader_base.h"
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// ReaderBase ------------------------------------------------------
+
+ReaderBase::ReaderBase(const string& name) : name_(name) {}
+
+int64 ReaderBase::NumRecordsProduced() {
+ mutex_lock lock(mu_);
+ return num_records_produced_;
+}
+
+int64 ReaderBase::NumWorkUnitsCompleted() {
+ mutex_lock lock(mu_);
+ return work_finished_;
+}
+
+Status ReaderBase::Reset() {
+ mutex_lock lock(mu_);
+ return ResetLocked();
+}
+
+Status ReaderBase::ResetLocked() {
+ work_started_ = 0;
+ work_finished_ = 0;
+ num_records_produced_ = 0;
+ work_.clear();
+ return Status::OK();
+}
+
+Status ReaderBase::SerializeState(string* state) {
+ mutex_lock lock(mu_);
+ return SerializeStateLocked(state);
+}
+
+Status ReaderBase::SerializeStateLocked(string* state) {
+ return errors::Unimplemented("Reader SerializeState");
+}
+
+Status ReaderBase::RestoreState(const string& state) {
+ mutex_lock lock(mu_);
+ Status status = RestoreStateLocked(state);
+ if (!status.ok()) {
+ ResetLocked();
+ }
+ return status;
+}
+
+Status ReaderBase::RestoreStateLocked(const string& state) {
+ return errors::Unimplemented("Reader RestoreState");
+}
+
+void ReaderBase::Read(QueueInterface* queue, string* key, string* value,
+ OpKernelContext* context) {
+ mutex_lock lock(mu_);
+ while (true) {
+ if (!work_in_progress()) {
+ GetNextWorkLocked(queue, context);
+ if (!context->status().ok()) return;
+ }
+
+ bool produced = false;
+ bool at_end = false;
+ Status status = ReadLocked(key, value, &produced, &at_end);
+
+ if (!at_end && status.ok() && !produced) {
+ status = errors::Internal(
+ "ReadLocked() for ", name(),
+ " must set *at_end=true, *produced=true, or return an error.");
+ }
+ if (!status.ok() && produced) {
+ status = errors::Internal("ReadLocked() for ", name(),
+ " set *produced=true *and* returned an error: ",
+ status.ToString());
+ }
+ if (status.ok() && at_end) {
+ status = OnWorkFinishedLocked();
+ work_finished_ = work_started_;
+ }
+ if (!status.ok()) {
+ context->SetStatus(status);
+ return;
+ }
+ if (produced) {
+ ++num_records_produced_;
+ return;
+ }
+ }
+}
+
+void ReaderBase::GetNextWorkLocked(QueueInterface* queue,
+ OpKernelContext* context) {
+ Notification n;
+ queue->TryDequeue(
+ context, [this, context, &n](const QueueInterface::Tuple& tuple) {
+ if (context->status().ok()) {
+ if (tuple.size() != 1) {
+ context->SetStatus(
+ errors::InvalidArgument("Expected single component queue"));
+ } else if (tuple[0].dtype() != DT_STRING) {
+ context->SetStatus(errors::InvalidArgument(
+ "Expected queue with single string component"));
+ } else if (tuple[0].NumElements() != 1) {
+ context->SetStatus(errors::InvalidArgument(
+ "Expected to dequeue a one-element string tensor"));
+ } else {
+ work_ = tuple[0].flat<string>()(0);
+ ++work_started_;
+ Status status = OnWorkStartedLocked();
+ if (!status.ok()) {
+ context->SetStatus(status);
+ --work_started_;
+ }
+ }
+ }
+ n.Notify();
+ });
+ n.WaitForNotification();
+}
+
+void ReaderBase::SaveBaseState(ReaderBaseState* state) const {
+ state->Clear();
+ state->set_work_started(work_started_);
+ state->set_work_finished(work_finished_);
+ state->set_num_records_produced(num_records_produced_);
+ state->set_current_work(work_);
+}
+
+Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
+ work_started_ = state.work_started();
+ work_finished_ = state.work_finished();
+ num_records_produced_ = state.num_records_produced();
+ work_ = state.current_work();
+ if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) {
+ return errors::InvalidArgument(
+ "Unexpected negative value when restoring in ", name(), ": ",
+ state.ShortDebugString());
+ }
+ if (work_started_ > work_finished_) {
+ return errors::InvalidArgument(
+ "Inconsistent work started vs. finished when restoring in ", name(),
+ ": ", state.ShortDebugString());
+ }
+ return Status::OK();
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reader_base.h b/tensorflow/core/kernels/reader_base.h
new file mode 100644
index 0000000000..d344300388
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.h
@@ -0,0 +1,107 @@
+#ifndef TENSORFLOW_KERNELS_READER_BASE_H_
+#define TENSORFLOW_KERNELS_READER_BASE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/reader_interface.h"
+#include "tensorflow/core/kernels/reader_base.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace tensorflow {
+
+// Default implementation of ReaderInterface.
+class ReaderBase : public ReaderInterface {
+ public:
+ // name: For use in error messages, should mention both the name of
+ // the op and the node.
+ explicit ReaderBase(const string& name);
+
+ // Note that methods with names ending in "Locked" are called while
+ // the ReaderBase's mutex is held.
+
+ // Implement this function in descendants -----------------------------------
+
+ // Produce the next key/value pair from the current work item.
+ // This is called "Locked" since it is executed under a mutex
+ // that serializes all Reader calls.
+ // Usage:
+ // a) If a record was successfully produced, set *produced = true,
+ // and fill in *key and *value.
+ // b) If no more records will be produced for this work item, set
+ // *at_end = true.
+ // c) If a record was produced, but no more will be produced, you
+ // may either do both (a) and (b), or do (a) in this call and do (b) in
+ // the next call to ReadLocked().
+ // d) If there was an error producing (e.g. an error reading the file,
+ // data corruption), return a non-OK() status. ReadLocked may be
+ // called again if the user reruns this part of the graph.
+ virtual Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) = 0;
+
+ // Descendants may optionally implement these -------------------------------
+
+ // Called when work starts / finishes.
+ virtual Status OnWorkStartedLocked() { return Status::OK(); }
+ virtual Status OnWorkFinishedLocked() { return Status::OK(); }
+
+ // Called to reset the Reader to a newly constructed state.
+ virtual Status ResetLocked();
+
+ // Default implementation generates an Unimplemented error.
+ // See the protected helper methods below.
+ virtual Status SerializeStateLocked(string* state);
+ virtual Status RestoreStateLocked(const string& state);
+
+ // Accessors ----------------------------------------------------------------
+
+ // Always true during a call to ReadLocked().
+ bool work_in_progress() const { return work_finished_ < work_started_; }
+
+ // Returns the name of the current work item (valid if
+ // work_in_progress() returns true). May change between calls to
+ // ReadLocked().
+ const string& current_work() const { return work_; }
+
+ // What was passed to the constructor.
+ const string& name() const { return name_; }
+
+ protected:
+ // For descendants wishing to implement serialize & restore state.
+
+ // Writes ReaderBase state to *state.
+ void SaveBaseState(ReaderBaseState* state) const;
+
+ // Restores ReaderBase state from state. Assumes state was filled
+ // using SaveBaseState() above.
+ Status RestoreBaseState(const ReaderBaseState& state);
+
+ private:
+ // Implementations of ReaderInterface methods. These ensure thread-safety
+ // and call the methods above to do the work.
+ void Read(QueueInterface* queue, string* key, string* value,
+ OpKernelContext* context) override;
+ Status Reset() override;
+ int64 NumRecordsProduced() override;
+ int64 NumWorkUnitsCompleted() override;
+ Status SerializeState(string* state) override;
+ Status RestoreState(const string& state) override;
+
+ // For implementing Read(). Dequeues the next work item from
+ // *queue, and if successful updates work_, work_started_
+ // (establishing work_in_progress() == true) and calls
+ // OnWorkStartedLocked(). May block.
+ void GetNextWorkLocked(QueueInterface* queue, OpKernelContext* context);
+
+ mutable mutex mu_;
+ const string name_;
+ int64 work_started_ = 0;
+ int64 work_finished_ = 0;
+ int64 num_records_produced_ = 0;
+ string work_;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_READER_BASE_H_
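To make the ReadLocked() contract above concrete, here is a hedged sketch of a descendant (hypothetical, not part of this change) that emits each work item's name as a single record and then reports end-of-work, exercising cases (a) and (b) from the comment:

    // Hypothetical reader, assumed to live in namespace tensorflow next to
    // ReaderBase: produces exactly one record per work item.
    class SingleRecordReader : public ReaderBase {
     public:
      explicit SingleRecordReader(const string& node_name)
          : ReaderBase(node_name) {}

      Status OnWorkStartedLocked() override {
        emitted_ = false;
        return Status::OK();
      }

      Status ReadLocked(string* key, string* value, bool* produced,
                        bool* at_end) override {
        if (!emitted_) {
          *key = current_work();   // case (a): produce a record.
          *value = current_work();
          *produced = true;
          emitted_ = true;
        } else {
          *at_end = true;          // case (b): nothing left for this work item.
        }
        return Status::OK();
      }

     private:
      bool emitted_ = false;
    };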
diff --git a/tensorflow/core/kernels/reader_base.proto b/tensorflow/core/kernels/reader_base.proto
new file mode 100644
index 0000000000..4335cb2152
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.proto
@@ -0,0 +1,13 @@
+syntax = "proto3";
+
+package tensorflow;
+// option cc_enable_arenas = true;
+
+// For serializing and restoring the state of ReaderBase, see
+// reader_base.h for details.
+message ReaderBaseState {
+ int64 work_started = 1;
+ int64 work_finished = 2;
+ int64 num_records_produced = 3;
+ bytes current_work = 4;
+};
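For illustration, a descendant (MyReader is hypothetical, assumed to sit in namespace tensorflow) could wire the optional SerializeStateLocked/RestoreStateLocked hooks to this message through the SaveBaseState()/RestoreBaseState() helpers in reader_base.h, using the standard protobuf SerializeToString/ParseFromString calls:

    Status MyReader::SerializeStateLocked(string* state) {
      ReaderBaseState base_state;
      SaveBaseState(&base_state);
      base_state.SerializeToString(state);
      return Status::OK();
    }

    Status MyReader::RestoreStateLocked(const string& state) {
      ReaderBaseState base_state;
      if (!base_state.ParseFromString(state)) {
        return errors::InvalidArgument("Could not parse state for ", name());
      }
      return RestoreBaseState(base_state);
    }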
diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc
new file mode 100644
index 0000000000..38c1013604
--- /dev/null
+++ b/tensorflow/core/kernels/reader_ops.cc
@@ -0,0 +1,132 @@
+// See docs in ../ops/io_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/reader_interface.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ReaderVerbOpKernel : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+
+ void Compute(OpKernelContext* context) override {
+ ReaderInterface* reader;
+ OP_REQUIRES_OK(context,
+ GetResourceFromContext(context, "reader_handle", &reader));
+ ComputeWithReader(context, reader);
+ reader->Unref();
+ }
+
+ protected:
+ virtual void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) = 0;
+};
+
+class ReaderReadOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ QueueInterface* queue;
+ OP_REQUIRES_OK(context,
+ GetResourceFromContext(context, "queue_handle", &queue));
+ core::ScopedUnref unref_me(queue);
+ Tensor* key = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("key", TensorShape({}), &key));
+ Tensor* value = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("value", TensorShape({}), &value));
+
+ auto key_scalar = key->scalar<string>();
+ auto value_scalar = value->scalar<string>();
+ reader->Read(queue, &key_scalar(), &value_scalar(), context);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderRead").Device(DEVICE_CPU), ReaderReadOp);
+
+class ReaderNumRecordsProducedOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("records_produced",
+ TensorShape({}), &output));
+ output->scalar<int64>()() = reader->NumRecordsProduced();
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderNumRecordsProduced").Device(DEVICE_CPU),
+ ReaderNumRecordsProducedOp);
+
+class ReaderNumWorkUnitsCompletedOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("units_completed",
+ TensorShape({}), &output));
+ output->scalar<int64>()() = reader->NumWorkUnitsCompleted();
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderNumWorkUnitsCompleted").Device(DEVICE_CPU),
+ ReaderNumWorkUnitsCompletedOp);
+
+class ReaderSerializeStateOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("state", TensorShape({}), &output));
+ OP_REQUIRES_OK(context,
+ reader->SerializeState(&output->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderSerializeState").Device(DEVICE_CPU),
+ ReaderSerializeStateOp);
+
+class ReaderRestoreStateOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ const Tensor* tensor;
+ OP_REQUIRES_OK(context, context->input("state", &tensor));
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(tensor->shape()),
+ errors::InvalidArgument("Reader state must be scalar, but had shape: ",
+ tensor->shape().DebugString()));
+ OP_REQUIRES_OK(context, reader->RestoreState(tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderRestoreState").Device(DEVICE_CPU),
+ ReaderRestoreStateOp);
+
+class ReaderResetOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ OP_REQUIRES_OK(context, reader->Reset());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderReset").Device(DEVICE_CPU), ReaderResetOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
new file mode 100644
index 0000000000..b412617a65
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -0,0 +1,66 @@
+#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+#define TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+
+// Functor definitions for Reduction ops, must be compilable by nvcc.
+
+#include <iostream>
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// When eigen3 has a better implementation of AllReducer and AnyReducer,
+// replace the reducers here.
+
+// Reduction using logical_and.
+struct AllReducer {
+ // TODO(zhifengc): Implement PacketAccess when performance matters.
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
+ *accum &= t;
+ }
+
+ EIGEN_DEVICE_FUNC bool initialize() const { return true; }
+
+ EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
+};
+
+// Reduction using logical_or.
+struct AnyReducer {
+ // TODO(zhifengc): Implement PacketAccess when performance matters.
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
+ *accum |= t;
+ }
+
+ EIGEN_DEVICE_FUNC bool initialize() const { return false; }
+
+ EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+ typename ReductionAxes, typename Reducer>
+void ReduceEigenImpl(const Device& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ out.device(d) = in.reduce(reduction_axes, reducer);
+}
+
+template <typename Device>
+struct ReduceFunctor {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const Device& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_H_
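The reducers above follow the initialize/reduce/finalize protocol that Eigen's reduction evaluator drives; a standalone sketch (illustrative only) of that protocol applied by hand with AllReducer:

    // Computes a logical AND over n booleans the same way Eigen would drive
    // AllReducer: start from initialize(), fold with reduce(), then finalize().
    bool AllOf(const bool* values, int n) {
      tensorflow::functor::AllReducer reducer;
      bool accum = reducer.initialize();    // true
      for (int i = 0; i < n; ++i) {
        reducer.reduce(values[i], &accum);  // accum &= values[i]
      }
      return reducer.finalize(accum);       // returns accum unchanged
    }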
diff --git a/tensorflow/core/kernels/reduction_ops_all.cc b/tensorflow/core/kernels/reduction_ops_all.cc
new file mode 100644
index 0000000000..11d399e70a
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_all.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("All")
+ .Device(DEVICE_CPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<CPUDevice, bool, functor::AllReducer>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("All")
+ .Device(DEVICE_GPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<GPUDevice, bool, functor::AllReducer>);
+#endif
+
+} // namespace tensorflow
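For intuition, the registered All op performs a logical-AND reduction over the requested axes: for a [2, 2] bool input [[true, false], [true, true]], reduction_indices = [1] yields [false, true], and reduction_indices = [0, 1] yields the scalar false.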
diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc
new file mode 100644
index 0000000000..a89ef22b08
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_any.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Any")
+ .Device(DEVICE_CPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<CPUDevice, bool, functor::AnyReducer>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Any")
+ .Device(DEVICE_GPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<GPUDevice, bool, functor::AnyReducer>);
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
new file mode 100644
index 0000000000..2bde3a1a54
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -0,0 +1,302 @@
+// This is an internal header file intended to only be included as the
+// front-matter in the implementation files of various reduction ops. It
+// is a header file because we split the various reduction ops into their
+// own compilation units to get more parallelism in compilation.
+
+#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/reduction_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+struct Constants {
+ // Derive Index type. int (32-bit) or long (64-bit) depending on the
+ // compile-time configuration. "float" here is not relevant.
+ // TODO(zhifengc): Moves the definition to TTypes.
+ typedef TTypes<float>::Tensor::Index Index;
+ Eigen::array<Index, 1> kZero;
+ Eigen::array<Index, 1> kOne;
+ Eigen::array<Index, 2> kZeroTwo;
+
+ Constants() {
+ kZero[0] = 0;
+ kOne[0] = 1;
+ kZeroTwo[0] = 0;
+ kZeroTwo[1] = 2;
+ }
+};
+
+#if defined(EIGEN_HAS_INDEX_LIST)
+template <>
+struct Constants<CPUDevice> {
+ const Eigen::IndexList<Eigen::type2index<0>> kZero;
+ const Eigen::IndexList<Eigen::type2index<1>> kOne;
+ const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo;
+};
+#endif
+
+namespace {
+
+class ReductionHelper {
+ public:
+ ReductionHelper() : reduce_first_axis_(false) {}
+
+ Status Simplify(const Tensor& data, const Tensor& axis,
+ const bool keep_dims) {
+ // bitmap[i] indicates whether to reduce data along i-th axis.
+ std::vector<bool> bitmap(data.dims(), false);
+ auto axis_vec = axis.flat<int32>();
+ for (int64 i = 0; i < axis.NumElements(); ++i) {
+ const int32 index = axis_vec(i);
+ if (index < 0 || index >= data.dims()) {
+ return errors::OutOfRange("Invalid reduction dimension (", index,
+ " for input with ", data.dims(),
+ " dimension(s)");
+ }
+ bitmap[index] = true;
+ }
+
+ // Output tensor's dim sizes.
+ out_shape_.clear();
+ for (int i = 0; i < data.dims(); ++i) {
+ if (!bitmap[i]) {
+ // If we are not reducing along dimension i.
+ out_shape_.push_back(data.dim_size(i));
+ } else if (keep_dims) {
+ // We are reducing along dimension i, but we want to keep the
+ // same number of dimensions, so we set the dimension of i to
+ // '1'.
+ out_shape_.push_back(1);
+ }
+ }
+
+ // Depending on bitmap[i] and bitmap[i-1], we can collapse adjacent axes
+ // of the input data before doing the reduction on the resulting
+ // tensor. The shape of the reduction result is a reshape of the final
+ // output.
+
+ // We'll skip the leading 1s.
+ int dim_index = 0;
+ for (; dim_index < data.dims(); ++dim_index) {
+ if (data.dim_size(dim_index) != 1) break;
+ }
+ if (dim_index >= data.dims()) {
+ // Special case. The input is essentially a scalar.
+ reduce_first_axis_ = true;
+ } else {
+ // Starting from the (dim_index)-th dimension, the dimensions
+ // alternate between runs that need to be reduced and runs that
+ // don't.
+ //
+ // NOTE: If a dimension has size 1, we group it with the current
+ // run so that we can minimize the number of runs.
+ //
+ // E.g., when we want to reduce a tensor of shape [2, 1, 3, 1,
+ // 5] by axes = [1, 4], we should treat it as a [6, 5] tensor
+ // and reduce by axes = [1] (i.e., the output is shape [6]).
+ reduce_first_axis_ = bitmap[dim_index];
+ data_reshape_.push_back(data.dim_size(dim_index));
+ ++dim_index;
+ for (; dim_index < data.dims(); ++dim_index) {
+ const auto size = data.dim_size(dim_index);
+ if (size == 1) {
+ bitmap[dim_index] = bitmap[dim_index - 1];
+ }
+ if (bitmap[dim_index - 1] != bitmap[dim_index]) {
+ // Starts a new run of reduce or !reduce.
+ data_reshape_.push_back(size);
+ } else {
+ // Continue a run of reduce or !reduce.
+ data_reshape_.back() *= size;
+ }
+ }
+ // If reduce_first_axis_ is true (the input's dimensions 0, 2, 4, ...
+ // are reduced), out_reshape_ is data_reshape_[1, 3, 5, ...];
+ // otherwise, it is data_reshape_[0, 2, 4, ...].
+ for (size_t i = reduce_first_axis_ ? 1 : 0; i < data_reshape_.size();
+ i += 2) {
+ out_reshape_.push_back(data_reshape_[i]);
+ }
+ }
+
+ VLOG(1) << "data reshape: " << str_util::Join(data_reshape_, ",");
+ VLOG(1) << "out reshape: " << str_util::Join(out_reshape_, ",");
+ VLOG(1) << "out shape: " << str_util::Join(out_shape_, ",");
+ return Status::OK();
+ }
+
+ // We need to do roughly:
+ // tmp_out = allocate(out_reshape())
+ // tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes)
+ // out = tmp_out.reshape(out_shape)
+
+ // The reduction result must be allocated with this shape.
+ TensorShape out_reshape() const {
+ TensorShape shape;
+ for (auto size : out_reshape_) shape.AddDim(size);
+ return shape;
+ }
+
+ // The final output shape must be allocated with this shape.
+ TensorShape out_shape() const {
+ TensorShape shape;
+ for (auto size : out_shape_) shape.AddDim(size);
+ return shape;
+ }
+
+ // The reduction is on a reshaped tensor of this rank.
+ int ndims() const { return data_reshape_.size(); }
+
+ // True if we need to reduce the 0-th dimension.
+ bool reduce_first_axis() const { return reduce_first_axis_; }
+
+ // The output is reshaped.
+ template <typename T, int N>
+ typename TTypes<T, N>::Tensor out(Tensor* out) {
+ return out->shaped<T, N>(out_reshape_);
+ }
+
+ // The input is reshaped.
+ template <typename T, int N>
+ typename TTypes<T, N>::ConstTensor in(const Tensor& data) {
+ return data.shaped<T, N>(data_reshape_);
+ }
+
+ private:
+ bool reduce_first_axis_; // True if we need to reduce the 0-th dimension.
+ std::vector<int64> data_reshape_; // Reshape the data before reduction.
+ std::vector<int64> out_shape_; // The final output shape.
+ std::vector<int64> out_reshape_; // Reshape the output for reduction.
+};
+
+} // end namespace
+
+// For operations where the output is a reduction function along some
+// dimensions of the input.
+template <typename Device, class T, typename Reducer>
+class ReductionOp : public OpKernel {
+ public:
+ explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& data = ctx->input(0);
+ const Tensor& axes = ctx->input(1);
+ VLOG(1) << "data shape: " << data.shape().ShortDebugString();
+ VLOG(1) << "axes : " << axes.SummarizeValue(10);
+
+ ReductionHelper helper;
+ OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_));
+ CHECK_GE(helper.ndims(), 0);
+
+ // The real output shape will be assigned below.
+ TensorShape empty_shape;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &out));
+
+ if (helper.ndims() == 0 ||
+ (helper.ndims() == 1 && !helper.reduce_first_axis())) {
+ // Special case. Reduces nothing. It is unclear why this is
+ // necessary, but tests fail without it. Look into why this
+ // case occurs.
+ if (!out->CopyFrom(data, helper.out_shape())) {
+ ctx->SetStatus(errors::Internal("Error during reduction copy."));
+ }
+ return;
+ }
+
+ // A temporary tensor whose size matches the size of the reduced
+ // output.
+ Tensor tmp_out;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_temp(out->dtype(), helper.out_reshape(), &tmp_out));
+
+ typedef functor::ReduceFunctor<Device> Functor;
+ Constants<Device> constants;
+ const Device& d = ctx->eigen_device<Device>();
+ Reducer reducer;
+
+ if ((helper.ndims() == 1) && helper.reduce_first_axis()) {
+ // Reduce to a scalar.
+ Functor::Reduce(d, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data),
+ constants.kZero, reducer);
+ } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a matrix along 1st dimension.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
+ constants.kZero, reducer);
+ } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a matrix along 2nd dimension.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
+ constants.kOne, reducer);
+ } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a 3D tensor along 1st and 3rd
+ // dimensions.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data),
+ constants.kZeroTwo, reducer);
+ } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a 3D tensor along 2nd dimension.
+ Functor::Reduce(d, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data),
+ constants.kOne, reducer);
+ } else {
+ // TODO(zhifengc): We can implement reduction for arbitrary rank
+ // tensor and arbitrary reduction axes by iterating the reduction
+ // multiple times. This may also be accomplished in the graph
+ // construction.
+ ctx->SetStatus(
+ errors::Unimplemented("Reducing ", data.shape().ShortDebugString(),
+ " axes [", axes.SummarizeValue(10), "] to ",
+ tmp_out.shape().ShortDebugString()));
+ return;
+ }
+
+ // Set the real output using the contents of the reduction but the
+ // real expected output shape. The number of elements should
+ // match between the two shapes.
+ if (!out->CopyFrom(tmp_out, helper.out_shape())) {
+ ctx->SetStatus(errors::Internal("Error during reduction copy."));
+ }
+ }
+
+ private:
+ // True if the number of dimensions should be maintained.
+ bool keep_dims_;
+};
+
+namespace functor {
+
+template <>
+struct ReduceFunctor<CPUDevice> {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const CPUDevice& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ ReduceEigenImpl(d, out, in, reduction_axes, reducer);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
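A worked trace of ReductionHelper::Simplify() for the example given in its comments (input shape [2, 1, 3, 1, 5], axes = {1, 4}, assuming keep_dims = false):

    bitmap        = {false, true, false, false, true}
    out_shape_    = {2, 3, 1}   // the non-reduced dimensions, in order
    data_reshape_ = {6, 5}      // adjacent kept/reduced runs collapsed
    out_reshape_  = {6}         // reduce_first_axis_ == false, so even-indexed runs survive

ReductionOp therefore reduces a [6, 5] view of the data along its second dimension (the ndims() == 2, !reduce_first_axis() branch) and reshapes the six-element result to the final [2, 3, 1] output.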
diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
new file mode 100644
index 0000000000..8e29d2d06c
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
@@ -0,0 +1,65 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/kernels/reduction_ops.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Derive Index type. int (32-bit) or long (64-bit) depending on the
+// compile-time configuration. "float" here is not relevant.
+// TODO(zhifengc): Moves the definition to TTypes.
+typedef TTypes<float>::Tensor::Index Index;
+
+template <>
+struct ReduceFunctor<GPUDevice> {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const GPUDevice& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ ReduceEigenImpl(d, To32Bit(out), To32Bit(in), reduction_axes, reducer);
+ }
+};
+
+// T: the data type
+// REDUCER: the reducer functor
+// IN_DIMS: the number of dimensions of the input tensor
+// NUM_AXES: the number of axes to reduce
+#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \
+ template void ReduceFunctor<GPUDevice>::Reduce( \
+ const GPUDevice& d, TTypes<T, IN_DIMS - NUM_AXES>::Tensor out, \
+ TTypes<T, IN_DIMS>::ConstTensor in, \
+ const Eigen::array<Index, NUM_AXES>& reduction_axes, \
+ const REDUCER& reducer);
+
+#define DEFINE_FOR_TYPE_AND_R(T, R) \
+ DEFINE(T, R, 1, 1); \
+ DEFINE(T, R, 2, 1); \
+ DEFINE(T, R, 3, 1); \
+ DEFINE(T, R, 3, 2);
+
+#define DEFINE_FOR_ALL_REDUCERS(T) \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
+
+DEFINE_FOR_ALL_REDUCERS(float);
+#undef DEFINE_FOR_ALL_REDUCERS
+
+DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(bool, AllReducer);
+DEFINE_FOR_TYPE_AND_R(bool, AnyReducer);
+#undef DEFINE_FOR_TYPE_AND_R
+
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
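As a concrete reading of the macros above, DEFINE(float, Eigen::internal::SumReducer<float>, 3, 2) expands to the explicit instantiation below: a rank-3 float tensor reduced over two axes into a rank-1 result.

    template void ReduceFunctor<GPUDevice>::Reduce(
        const GPUDevice& d, TTypes<float, 1>::Tensor out,
        TTypes<float, 3>::ConstTensor in,
        const Eigen::array<Index, 2>& reduction_axes,
        const Eigen::internal::SumReducer<float>& reducer);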
diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc
new file mode 100644
index 0000000000..1749360b6e
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_max.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Max").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MaxReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Max") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::MaxReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
new file mode 100644
index 0000000000..b00c36fed8
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -0,0 +1,12 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Mean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MeanReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
new file mode 100644
index 0000000000..de1f4b8520
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Min").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MinReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Min") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::MinReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc
new file mode 100644
index 0000000000..4068c7feda
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_prod.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Prod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Prod") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
new file mode 100644
index 0000000000..82d685e225
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -0,0 +1,37 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::SumReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+// NOTE: We should have mean(complex64,int32), too. But that requires
+// changing Eigen::internal::MeanReducer to cast int to complex<float>.
+// We don't see an immediate need for mean(complex64,int32) anyway.
+REGISTER_KERNEL_BUILDER(
+ Name("Sum").Device(DEVICE_CPU).TypeConstraint<complex64>("T"),
+ ReductionOp<CPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sum") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::SumReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+ Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
+ ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
+
+#endif
+
+} // namespace tensorflow
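As a shape example for these registrations (the actual work is done by ReductionOp in reduction_ops_common.h): summing a [2, 3] float tensor with reduction_indices = [0] produces a [3] result, or [1, 3] when the keep_dims attribute is set.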
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
new file mode 100644
index 0000000000..d96da3c7f1
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -0,0 +1,73 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+// Creates a Graph which applies the reduction op "reduce" to a 3D float
+// tensor of "num" elements, producing a scalar.
+static Graph* ToScalar(const string& reduce, int num) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ data.flat<float>().setRandom();
+ Tensor axes(DT_INT32, TensorShape({3}));
+ axes.flat<int32>()(0) = 0;
+ axes.flat<int32>()(1) = 1;
+ axes.flat<int32>()(2) = 2;
+ test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
+ test::graph::Constant(g, axes));
+ return g;
+}
+
+// Creates a benchmark which reduces a 3D tensor with a total of "num" floats
+// into a scalar on a "device". Runs the benchmark "iters" times.
+static void ReduceToScalar(int iters, const string& device,
+ const string& reduce, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num * sizeof(float));
+ test::Benchmark(device, ToScalar(reduce, num)).Run(iters);
+}
+
+static void BM_Sum3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Sum", num);
+}
+BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Max3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Max", num);
+}
+BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Prod3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Prod", num);
+}
+BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Mean3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Mean", num);
+}
+BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Sum3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Sum", num);
+}
+BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Max3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Max", num);
+}
+BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Prod3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Prod", num);
+}
+BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+// Once Mean is available on GPU, enable this.
+// static void BM_Mean3DToScalarGPU(int iters, int num) {
+// ReduceToScalar(iters, "gpu", "Mean", num);
+// }
+// BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h
new file mode 100644
index 0000000000..77c6ef35e9
--- /dev/null
+++ b/tensorflow/core/kernels/reference_gemm.h
@@ -0,0 +1,75 @@
+#ifndef TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
+#define TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
+
+// This is an unoptimized but debuggable implementation of the GEMM matrix
+// multiply function, used to compare to faster but more opaque versions, or
+// for bit depths or argument combinations that aren't supported by optimized
+// code.
+// It assumes the row-major convention used by TensorFlow, and implements
+// C = A * B, like the standard BLAS GEMM interface. If the transpose flags are
+// true, then the relevant matrix is treated as stored in column-major order.
+
+namespace tensorflow {
+template <class T1, class T2, class T3>
+void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
+ size_t m, size_t n, size_t k, const T1* a, T1 offset_a,
+ size_t lda, const T2* b, T2 offset_b, size_t ldb, T3* c,
+ int32 shift_c, int32 offset_c, int32 mult_c, size_t ldc) {
+ int a_i_stride;
+ int a_l_stride;
+ if (transpose_a) {
+ a_i_stride = 1;
+ a_l_stride = lda;
+ } else {
+ a_i_stride = lda;
+ a_l_stride = 1;
+ }
+ int b_j_stride;
+ int b_l_stride;
+ if (transpose_b) {
+ b_j_stride = ldb;
+ b_l_stride = 1;
+ } else {
+ b_j_stride = 1;
+ b_l_stride = ldb;
+ }
+ int c_i_stride;
+ int c_j_stride;
+ if (transpose_c) {
+ c_i_stride = 1;
+ c_j_stride = ldc;
+ } else {
+ c_i_stride = ldc;
+ c_j_stride = 1;
+ }
+
+ const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
+ const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+ const int32 rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1));
+
+ int i, j, l;
+ for (j = 0; j < n; j++) {
+ for (i = 0; i < m; i++) {
+ int32 total = 0;
+ for (l = 0; l < k; l++) {
+ const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
+ const int32 a_value = a[a_index] - offset_a;
+ const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
+ const int32 b_value = b[b_index] - offset_b;
+ total += (a_value * b_value);
+ }
+ const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));
+ int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c);
+ if (output > highest) {
+ output = highest;
+ }
+ if (output < lowest) {
+ output = lowest;
+ }
+ c[c_index] = static_cast<T3>(output);
+ }
+ }
+}
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
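A usage sketch with illustrative values (it assumes the TensorFlow integer typedefs and Eigen's NumTraits are already in scope, as they are in the kernels that include this header): multiply a row-major 2x3 uint8 matrix by a 3x2 uint8 matrix into a 2x2 int32 result, with the quantization offsets, multiplier and shift set to their identity values.

    #include "tensorflow/core/kernels/reference_gemm.h"

    void SketchReferenceGemm() {
      const size_t m = 2, n = 2, k = 3;
      const tensorflow::uint8 a[m * k] = {1, 2, 3, 4, 5, 6};     // 2x3
      const tensorflow::uint8 b[k * n] = {7, 8, 9, 10, 11, 12};  // 3x2
      tensorflow::int32 c[m * n];
      tensorflow::ReferenceGemm<tensorflow::uint8, tensorflow::uint8,
                                tensorflow::int32>(
          false, false, false, m, n, k, a, /*offset_a=*/0, /*lda=*/k, b,
          /*offset_b=*/0, /*ldb=*/n, c, /*shift_c=*/0, /*offset_c=*/0,
          /*mult_c=*/1, /*ldc=*/n);
      // c now holds {58, 64, 139, 154}.
    }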
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
new file mode 100644
index 0000000000..d5dd7a8119
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -0,0 +1,154 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/relu_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Relu<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Relu6<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to ReluOp()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::ReluGrad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to Relu6Op()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::Relu6Grad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReluOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ Relu6Op<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReluGradOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ Relu6GradOp<CPUDevice, type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void Relu<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Relu<GPUDevice, T>; \
+ \
+ template <> \
+ void ReluGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ \
+ extern template struct ReluGrad<GPUDevice, T>; \
+ template <> \
+ void Relu6<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Relu6<GPUDevice, T>; \
+ \
+ template <> \
+ void Relu6Grad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ extern template struct Relu6Grad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReluOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ Relu6Op<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReluGradOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ Relu6GradOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
new file mode 100644
index 0000000000..8ed071cc4a
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op.h
@@ -0,0 +1,79 @@
+#ifndef TENSORFLOW_KERNELS_RELU_OP_H_
+#define TENSORFLOW_KERNELS_RELU_OP_H_
+// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by ReluOp to do the computations.
+template <typename Device, typename T>
+struct Relu {
+ // Computes Relu activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
+ activations.device(d) = features.cwiseMax(static_cast<T>(0));
+ }
+};
+
+// Functor used by ReluGradOp to do the computations.
+template <typename Device, typename T>
+struct ReluGrad {
+ // Computes ReluGrad backprops.
+ //
+ // gradients: gradients backpropagated to the Relu op.
+ // features: inputs that were passed to the Relu op.
+ // backprops: gradients to backpropagate to the Relu inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
+ // NOTE: When the activation is exactly zero, we arbitrarily choose to not
+ // propagate the associated gradient value.
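+ // For example (informal): with features = {-1, 0, 2} and gradients =
+ // {g0, g1, g2}, the expression below yields backprops = {0, 0, g2}, i.e.
+ // gradients flow only where the input was strictly positive.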
+ backprops.device(d) =
+ gradients * (features > features.constant(static_cast<T>(0)));
+ }
+};
+
+// Functor used by Relu6Op to do the computations.
+template <typename Device, typename T>
+struct Relu6 {
+ // Computes Relu6 activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
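+ //
+ // For example (informal): features = {-3, 2, 9} maps to activations =
+ // {0, 2, 6}.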
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
+ activations.device(d) =
+ features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
+ }
+};
+
+// Functor used by Relu6GradOp to do the computations.
+template <typename Device, typename T>
+struct Relu6Grad {
+ // Computes Relu6Grad backprops.
+ //
+ // gradients: gradients backpropagated to the Relu6 op.
+ // features: inputs that were passed to the Relu6 op.
+ // backprops: gradients to backpropagate to the Relu6 inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
+ // NOTE: When the activation is exactly zero or six, we
+ // arbitrarily choose to not propagate the associated gradient
+ // value.
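+ // For example (informal): with features = {0, 3, 7} and gradients =
+ // {g0, g1, g2}, the expression below yields backprops = {0, g1, 0}.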
+ backprops.device(d) = gradients *
+ (features > features.constant(static_cast<T>(0))) *
+ (features < features.constant(static_cast<T>(6)));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RELU_OP_H_
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
new file mode 100644
index 0000000000..6bd87ff8e4
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -0,0 +1,27 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/relu_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in relu_op.cc.
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Relu<GPUDevice, T>; \
+ template struct functor::ReluGrad<GPUDevice, T>; \
+ template struct functor::Relu6<GPUDevice, T>; \
+ template struct functor::Relu6Grad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
new file mode 100644
index 0000000000..7e1cf029de
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -0,0 +1,29 @@
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/reshape_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Reshape").Device(DEVICE_CPU).HostMemory("shape"),
+ ReshapeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Reshape") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<type>("T"), \
+ ReshapeOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Reshape")
+ .Device(DEVICE_GPU)
+ .HostMemory("tensor")
+ .HostMemory("shape")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ReshapeOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h
new file mode 100644
index 0000000000..3fd3f4492e
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_op.h
@@ -0,0 +1,83 @@
+#ifndef TENSORFLOW_KERNELS_RESHAPE_OP_H_
+#define TENSORFLOW_KERNELS_RESHAPE_OP_H_
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ReshapeOp : public OpKernel {
+ public:
+ explicit ReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& sizes = context->input(1);
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(sizes.shape()),
+ errors::InvalidArgument("sizes input must be 1-D, not shape ",
+ sizes.shape().ShortDebugString()));
+ const int64 num_dims = sizes.NumElements();
+ OP_REQUIRES(
+ context, num_dims <= 8,
+ errors::InvalidArgument(num_dims, " > max 8 output dims supported"));
+
+ // Compute the output shape. Determine product of specified
+ // dimensions, and find the index of the unspecified one.
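+ // For example (informal): an input with 12 elements reshaped with
+ // sizes = {3, -1} gives product = 3 and unknown_index = 1, so the
+ // missing dimension is inferred below as 12 / 3 = 4.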
+ TensorShape shape;
+ int32 product = 1;
+ int unknown_index = -1;
+ auto Svec = sizes.flat<int32>();
+ for (int d = 0; d < num_dims; ++d) {
+ const int32 size = Svec(d);
+ if (size == -1) {
+ OP_REQUIRES(
+ context, unknown_index == -1,
+ errors::InvalidArgument("only one input size may be -1, not both ",
+ unknown_index, " and ", d));
+ unknown_index = d;
+ shape.AddDim(1);
+ } else {
+ OP_REQUIRES(context, size >= 0,
+ errors::InvalidArgument(
+ "size ", d, " must be non-negative, not ", size));
+ shape.AddDim(size);
+ product *= size;
+ }
+ }
+ if (unknown_index != -1) {
+ OP_REQUIRES(
+ context, product > 0,
+ errors::InvalidArgument("cannot infer the missing input size for "
+ "an empty tensor unless all specified "
+ "input sizes are non-zero"));
+ const int32 missing = input.NumElements() / product;
+ OP_REQUIRES(context, product * missing == input.NumElements(),
+ errors::InvalidArgument("Input has ", input.NumElements(),
+ " values, which isn't divisible by ",
+ product));
+ shape.set_dim(unknown_index, missing);
+ }
+ OP_REQUIRES(context, shape.num_elements() == input.NumElements(),
+ errors::InvalidArgument("Input has ", input.NumElements(),
+ " values, which isn't the same as ",
+ shape.num_elements()));
+
+ // Actually produce the reshaped output.
+ Tensor output(input.dtype());
+ CHECK(output.CopyFrom(input, shape));
+ context->set_output(0, output);
+ }
+
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RESHAPE_OP_H_
diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc
new file mode 100644
index 0000000000..2b22d38ad6
--- /dev/null
+++ b/tensorflow/core/kernels/resize_area_op.cc
@@ -0,0 +1,139 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeAreaOp : public OpKernel {
+ public:
+ explicit ResizeAreaOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ // A temporary tensor for computing the sum.
+ Tensor sum_tensor;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DataTypeToEnum<float>::value,
+ TensorShape({channels}), &sum_tensor));
+ typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
+
+ // When using this algorithm for downsizing, the target pixel value is the
+ // weighted average of all the source pixels. The weight is determined by
+ // the contribution percentage of the source pixel.
+ //
+ // Let "scale" be "target_image_size/source_image_size". If 1/n of the
+ // source pixel contributes to the target pixel, then the weight is (1/n *
+ // scale); if the complete source pixel contributes to the target pixel,
+ // then the weight is scale.
+ //
+ // To visualize the implementation, use one dimension as an example:
+ // Resize in[4] to out[3].
+ // scale = 3/4 = 0.75
+ // out[0]: in[0] and 1/3 of in[1]
+ // out[1]: 2/3 of in[1] and 2/3 of in[2]
+ // out[2]: 1/3 of in[2] and in[3]
+ // Hence, the output pixel values are:
+ // out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale
+ // out[1] = (in[1] * 2/3 + in[2] * 2/3) * scale
+ // out[2] = (in[2] * 1/3 + in[3] * 1.0) * scale
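+ // As an informal numerical check (values chosen only for illustration):
+ // for in = {10, 20, 30, 40} this gives
+ // out[0] = (10 * 1.0 + 20 * 1/3) * 0.75 = 12.5
+ // out[1] = (20 * 2/3 + 30 * 2/3) * 0.75 = 25.0
+ // out[2] = (30 * 1/3 + 40 * 1.0) * 0.75 = 37.5
+ // and the weights for each output pixel sum to exactly 1.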
+ float scale = 1.0 / (height_scale * width_scale);
+ for (int64 b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const float in_y = y * height_scale;
+ const float in_y1 = (y + 1) * height_scale;
+ // The start and end height indices of all the cells that could
+ // contribute to the target cell.
+ int64 y_start = floor(in_y);
+ int64 y_end = ceil(in_y1);
+
+ for (int64 x = 0; x < out_width; ++x) {
+ const float in_x = x * width_scale;
+ const float in_x1 = (x + 1) * width_scale;
+ // The start and end width indices of all the cells that could
+ // contribute to the target cell.
+ int64 x_start = floor(in_x);
+ int64 x_end = ceil(in_x1);
+
+ sum_data.setConstant(0.0);
+ for (int64 i = y_start; i < y_end; ++i) {
+ float scale_y =
+ i < in_y ? i + 1 - in_y : (i + 1 > in_y1 ? in_y1 - i : 1.0);
+ for (int64 j = x_start; j < x_end; ++j) {
+ float scale_x =
+ j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0);
+ for (int64 c = 0; c < channels; ++c) {
+#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
+ sum_data(c) +=
+ input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) *
+ scale_y * scale_x * scale;
+#undef BOUND
+ }
+ }
+ }
+ for (int64 c = 0; c < channels; ++c) {
+ output_data(b, y, x, c) = sum_data(c);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeArea") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeAreaOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
new file mode 100644
index 0000000000..472fc19b82
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -0,0 +1,121 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeBicubicOp : public OpKernel {
+ public:
+ explicit ResizeBicubicOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
+
+ // Initialize coefficients table using Bicubic convolution algorithm.
+ // https://en.wikipedia.org/wiki/Bicubic_interpolation
+ static const int64 tab_size = (1 << 10);
+ static float coeffs_tab[(tab_size + 1) * 2];
+ static const double A = -0.75;
+ for (int i = 0; i <= tab_size; ++i) {
+ float x = i * 1.0 / tab_size;
+ coeffs_tab[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
+ x += 1.0;
+ coeffs_tab[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+ }
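+ // For reference, the loop above tabulates the cubic convolution kernel
+ // W(x) = (A+2)|x|^3 - (A+3)|x|^2 + 1 for |x| <= 1
+ // W(x) = A|x|^3 - 5A|x|^2 + 8A|x| - 4A for 1 < |x| < 2
+ // with A = -0.75: coeffs_tab[i * 2] holds the |x| <= 1 branch and
+ // coeffs_tab[i * 2 + 1] the 1 < |x| < 2 branch, sampled at x = i / tab_size.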
+
+ auto cal = [](float v0, float v1, float v2, float v3, float dx) {
+ const int64 offset = round(dx * tab_size);
+ const float a0 = coeffs_tab[offset * 2 + 1];
+ const float a1 = coeffs_tab[offset * 2];
+ const float a2 = coeffs_tab[(tab_size - offset) * 2];
+ const float a3 = coeffs_tab[(tab_size - offset) * 2 + 1];
+ return a0 * v0 + a1 * v1 + a2 * v2 + a3 * v3;
+ };
+
+ float coeff[4] = {0.0};
+ for (int64 b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const int64 in_y = floor(height_scale * y);
+ const float dy = height_scale * y - in_y;
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 in_x = floor(width_scale * x);
+ const float dx = width_scale * x - in_x;
+ for (int64 c = 0; c < channels; ++c) {
+ for (int64 i = 0; i < 4; ++i) {
+#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
+ int64 bound_y = BOUND(in_y - 1 + i, in_height);
+ coeff[i] =
+ cal(input_data(b, bound_y, BOUND(in_x - 1, in_width), c),
+ input_data(b, bound_y, BOUND(in_x, in_width), c),
+ input_data(b, bound_y, BOUND(in_x + 1, in_width), c),
+ input_data(b, bound_y, BOUND(in_x + 2, in_width), c), dx);
+#undef BOUND
+ }
+ output_data(b, y, x, c) =
+ cal(coeff[0], coeff[1], coeff[2], coeff[3], dy);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeBicubic") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeBicubicOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
new file mode 100644
index 0000000000..5119b93508
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -0,0 +1,109 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeBilinearOp : public OpKernel {
+ public:
+ explicit ResizeBilinearOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
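+ // For example (informal): resizing 2 -> 3 along one dimension gives a
+ // scale of 2/3, so output row 1 samples source coordinate 2/3 and blends
+ // input row 0 with weight 1/3 and input row 1 with weight 2/3 (values 1
+ // and 3 interpolate to 7/3, as exercised in resize_bilinear_op_test.cc).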
+
+ for (int b = 0; b < batch_size; ++b) {
+ for (int y = 0; y < out_height; ++y) {
+ const float in_y = y * height_scale;
+ const int top_y_index = static_cast<int>(floorf(in_y));
+ const int bottom_y_index =
+ std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1));
+ const float y_lerp = in_y - top_y_index;
+ const float inverse_y_lerp = (1.0f - y_lerp);
+ for (int x = 0; x < out_width; ++x) {
+ const float in_x = x * width_scale;
+ const int left_x_index = static_cast<int>(floorf(in_x));
+ const int right_x_index =
+ std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1));
+ const float x_lerp = in_x - left_x_index;
+ const float inverse_x_lerp = (1.0f - x_lerp);
+ for (int c = 0; c < channels; ++c) {
+ const float top_left = input_data(b, top_y_index, left_x_index, c);
+ const float top_right =
+ input_data(b, top_y_index, right_x_index, c);
+ const float bottom_left =
+ input_data(b, bottom_y_index, left_x_index, c);
+ const float bottom_right =
+ input_data(b, bottom_y_index, right_x_index, c);
+ const float top =
+ (top_left * inverse_x_lerp) + (top_right * x_lerp);
+ const float bottom =
+ (bottom_left * inverse_x_lerp) + (bottom_right * x_lerp);
+ output_data(b, y, x, c) =
+ (top * inverse_y_lerp) + (bottom * y_lerp);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeBilinear") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeBilinearOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
new file mode 100644
index 0000000000..0ebe2e5f8c
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -0,0 +1,171 @@
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class ResizeBilinearOpTest : public OpsTestBase {
+ protected:
+ ResizeBilinearOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("resize_bilinear_op", "ResizeBilinear")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To1x1) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // When scaling down, we have to arbitrarily pick a pixel from the
+ // original input. In this case, we choose the top-left pixel.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+ test::FillValues<float>(&expected, {1.0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+ // The corners should match the original corners, and we bilinear
+ // interpolate the values in between.
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 5.0/3, 2,
+ 7.0/3, 3, 10.0/3,
+ 3, 11.0/3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear3x3To4x4) {
+ // Input:
+ // 1, 2, 3,
+ // 4, 5, 6,
+ // 7, 8, 9
+ AddInputFromArray<float>(TensorShape({1, 3, 3, 1}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ // The corners should match the original corners, and we bilinear
+ // interpolate the values in between.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1.75, 2.5, 3,
+ 3.25, 4, 4.75, 5.25,
+ 5.5, 6.25, 7, 7.5,
+ 7, 7.75, 8.5, 9});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ //
+ // repeated twice
+ AddInputFromArray<float>(TensorShape({2, 2, 2, 1}), {1, 2, 3, 4, 1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4,
+ 1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4
+ });
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 2}),
+ {1, -1, 2, -2, 3, -3, 4, -4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 2}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {
+ 1, -1,
+ 5.0/3, -5.0/3,
+ 2, -2,
+ 7.0/3, -7.0/3,
+ 3, -3,
+ 10.0/3, -10.0/3,
+ 3, -3,
+ 11.0/3, -11.0/3,
+ 4, -4
+ });
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1.5, 2, 2,
+ 2, 2.5, 3, 3,
+ 3, 3.5, 4, 4,
+ 3, 3.5, 4, 4});
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
+ AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
new file mode 100644
index 0000000000..13089308ce
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -0,0 +1,89 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeNearestNeighborOp : public OpKernel {
+ public:
+ explicit ResizeNearestNeighborOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
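+ // For example (informal): resizing width 2 -> 5 gives width_scale = 0.4,
+ // so output columns {0, 1, 2, 3, 4} read from input columns
+ // floor({0, 0.4, 0.8, 1.2, 1.6}) = {0, 0, 0, 1, 1}.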
+
+ for (int b = 0; b < batch_size; ++b) {
+ for (int y = 0; y < out_height; ++y) {
+ const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)),
+ (in_height - 1));
+ for (int x = 0; x < out_width; ++x) {
+ const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)),
+ (in_width - 1));
+ for (int c = 0; c < channels; ++c) {
+ output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeNearestNeighborOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
new file mode 100644
index 0000000000..8fca1f34e3
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
@@ -0,0 +1,163 @@
+// TODO(shlens, sherrym): Consider adding additional tests in image_ops.py in
+// order to compare against the reference implementation for image resizing in
+// the Python Imaging Library.
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class ResizeNearestNeighborOpTest : public OpsTestBase {
+ protected:
+ ResizeNearestNeighborOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("resize_nn", "ResizeNearestNeighbor")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To1x1) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected, {1});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To3x3) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 2,
+ 1, 1, 2,
+ 3, 3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To2x5) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {2, 5});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 5, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 1, 2, 2,
+ 3, 3, 3, 4, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To5x2) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {5, 2});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 5, 2, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 2,
+ 1, 2,
+ 1, 2,
+ 3, 4,
+ 3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To4x4) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 2, 2,
+ 1, 1, 2, 2,
+ 3, 3, 4, 4,
+ 3, 3, 4, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2x2x2To2x3x3x2) {
+ // Input:
+ // [ [ 1, 1 ], [ 2, 2],
+ // [ 3, 3 ], [ 4, 4] ],
+ // [ [ 5, 5 ], [ 6, 6],
+ // [ 7, 7 ], [ 8, 8] ]
+ AddInputFromArray<float>(TensorShape({2, 2, 2, 2}),
+ {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 2}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 1,
+ 1, 2, 2,
+ 1, 1, 1,
+ 1, 2, 2,
+ 3, 3, 3,
+ 3, 4, 4,
+ 5, 5, 5,
+ 5, 6, 6,
+ 5, 5, 5,
+ 5, 6, 6,
+ 7, 7, 7,
+ 7, 8, 8});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/restore_op.cc b/tensorflow/core/kernels/restore_op.cc
new file mode 100644
index 0000000000..b52c69449c
--- /dev/null
+++ b/tensorflow/core/kernels/restore_op.cc
@@ -0,0 +1,65 @@
+// See docs in ../ops/io_ops.cc.
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+
+namespace tensorflow {
+
+class RestoreOp : public OpKernel {
+ public:
+ explicit RestoreOp(OpKernelConstruction* context) : OpKernel(context) {
+ int preferred_shard;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("preferred_shard", &preferred_shard));
+ if (preferred_shard == -1) {
+ preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards;
+ } else {
+ OP_REQUIRES(context, preferred_shard >= 0,
+ errors::InvalidArgument("Attribute 'preferred_shard' must be "
+ "greater or equal to -1"));
+ preferred_shard_ = preferred_shard;
+ }
+ }
+ void Compute(OpKernelContext* context) override {
+ RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
+ preferred_shard_, false);
+ }
+
+ private:
+ int preferred_shard_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Restore").Device(DEVICE_CPU), RestoreOp);
+
+class RestoreSliceOp : public OpKernel {
+ public:
+ explicit RestoreSliceOp(OpKernelConstruction* context) : OpKernel(context) {
+ int preferred_shard;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("preferred_shard", &preferred_shard));
+ if (preferred_shard == -1) {
+ preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards;
+ } else {
+ OP_REQUIRES(context, preferred_shard >= 0,
+ errors::InvalidArgument("Attribute 'preferred_shard' must be "
+ "greater or equal to -1"));
+ preferred_shard_ = preferred_shard;
+ }
+ }
+ void Compute(OpKernelContext* context) override {
+ RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
+ preferred_shard_, true);
+ }
+
+ private:
+ int preferred_shard_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RestoreSlice").Device(DEVICE_CPU),
+ RestoreSliceOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc
new file mode 100644
index 0000000000..59343a8037
--- /dev/null
+++ b/tensorflow/core/kernels/restore_op_test.cc
@@ -0,0 +1,305 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class RestoreOpTest : public OpsTestBase {
+ protected:
+ // Makes an operation to restore two tensors
+ void MakeRestoreOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Restore")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("dt", dt)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(RestoreOpTest, RestoreInt) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
+ const string tensor_name = "tensor_int";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_INT32}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is an integer tensor: it's a 1-d array.
+ Tensor input_2(DT_INT32, TensorShape({10}));
+ for (int i = 0; i < 10; ++i) {
+ input_2.flat<int32>()(i) = i + 1;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreOp(DT_INT32);
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have an integer tensor
+ Tensor* output = GetOutput(0);
+ TensorShape expected({10});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(i + 1, output->flat<int32>()(i));
+ }
+}
+
+TEST_F(RestoreOpTest, RestoreFloat) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_float");
+ const string tensor_name = "tensor_float";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_FLOAT}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is a float tensor: it's a 2-d array.
+ Tensor input_2(DT_FLOAT, TensorShape({2, 4}));
+ for (int i = 0; i < 8; ++i) {
+ input_2.flat<float>()(i) = static_cast<float>(i) / 10;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreOp(DT_FLOAT);
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have a float tensor.
+ Tensor* output = GetOutput(0);
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, output->flat<float>()(i));
+ }
+}
+
+class RestoreSliceOpTest : public OpsTestBase {
+ protected:
+ void MakeRestoreSliceOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "RestoreSlice")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("dt", dt)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(RestoreSliceOpTest, RestoreInt) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
+ const string tensor_name = "tensor_int";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_INT32}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is a 4x16 integer tensor.
+ Tensor input_2(DT_INT32, TensorShape({4, 16}));
+ for (int64 i = 0; i < input_2.NumElements(); ++i) {
+ input_2.flat<int32>()(i) = i + 1;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreSliceOp(DT_INT32);
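+ // The shape-and-slice spec below says: the saved tensor has full shape
+ // [4, 16]; take the slice "0,2" (start 0, length 2) of dimension 0 and
+ // all ("-") of dimension 1, so a [2, 16] tensor is expected back.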
+ string shape_and_slice = "4 16 0,2:-";
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+ // Add the tensor shape and slice
+ AddInput<string>(TensorShape({}), [&shape_and_slice](int x) -> string {
+ return shape_and_slice;
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have an integer tensor
+ Tensor* output = GetOutput(0);
+ TensorShape expected({2, 16});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int64 i = 0; i < expected.num_elements(); ++i) {
+ EXPECT_EQ(i + 1, output->flat<int32>()(i));
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
new file mode 100644
index 0000000000..c63dfc1e70
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -0,0 +1,139 @@
+// See docs in ../ops/array_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/reverse_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class ReverseOp : public OpKernel {
+ public:
+ explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& dims = context->input(1);
+
+ if (TensorShapeUtils::IsScalar(input.shape())) {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ output->scalar<T>() = input.scalar<T>();
+
+ } else {
+ const int input_dims = input.dims();
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
+ errors::InvalidArgument("'dims' must be 1-dimension, not ",
+ dims.dims()));
+
+ OP_REQUIRES(context, input_dims == dims.dim_size(0),
+ errors::InvalidArgument(
+ "'dims' must have the same number of values as 'input' has "
+ "dimensions. 'input' has ", input_dims, "'dims' has ",
+ dims.dim_size(0), " values"));
+ OP_REQUIRES(context, input_dims <= 8, errors::Unimplemented(
+ "reverse is not implemented for tensors of rank > 8."));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+#define HANDLE_REVERSE(NDIMS) \
+ case NDIMS: \
+ functor::Reverse<Device, T, NDIMS>()( \
+ context->eigen_device<Device>(), input.tensor<T, NDIMS>(), \
+ dims.vec<bool>(), output->tensor<T, NDIMS>()); \
+ return;
+
+ switch (input_dims) {
+ HANDLE_REVERSE(0);
+ HANDLE_REVERSE(1);
+ HANDLE_REVERSE(2);
+ HANDLE_REVERSE(3);
+ HANDLE_REVERSE(4);
+ HANDLE_REVERSE(5);
+ HANDLE_REVERSE(6);
+ HANDLE_REVERSE(7);
+ HANDLE_REVERSE(8);
+ }
+#undef HANDLE_REVERSE
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Reverse") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("dims"), \
+ ReverseOp<CPUDevice, T>)
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(bool);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the function specializations for GPU (to prevent
+// building the GPU versions here, they will be built compiling _gpu.cu.cc).
+namespace functor {
+#define DECLARE_GPU_SPEC_DIM(T, DIM) \
+ template <> \
+ void Reverse<GPUDevice, T, DIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
+ typename TTypes<bool, 1>::ConstTensor dims, \
+ typename TTypes<T, DIM>::Tensor output); \
+ extern template struct Reverse<GPUDevice, T, DIM>;
+#define DECLARE_GPU_SPEC(T) \
+ DECLARE_GPU_SPEC_DIM(T, 0) \
+ DECLARE_GPU_SPEC_DIM(T, 1) \
+ DECLARE_GPU_SPEC_DIM(T, 2) \
+ DECLARE_GPU_SPEC_DIM(T, 3) \
+ DECLARE_GPU_SPEC_DIM(T, 4) \
+ DECLARE_GPU_SPEC_DIM(T, 5) \
+ DECLARE_GPU_SPEC_DIM(T, 6) \
+ DECLARE_GPU_SPEC_DIM(T, 7) \
+ DECLARE_GPU_SPEC_DIM(T, 8)
+
+DECLARE_GPU_SPEC(uint8);
+DECLARE_GPU_SPEC(int8);
+DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(bool);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+#undef DECLARE_GPU_SPEC_DIM
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Reverse") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("dims"), \
+ ReverseOp<GPUDevice, T>)
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.h b/tensorflow/core/kernels/reverse_op.h
new file mode 100644
index 0000000000..bba25f70e8
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op.h
@@ -0,0 +1,28 @@
+#ifndef TENSORFLOW_KERNELS_REVERSE_OP_H_
+#define TENSORFLOW_KERNELS_REVERSE_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by ReverseOp to do the computations.
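+// For example (informal): for a 2-D input with dims = {true, false}, the
+// row order is reversed and the columns are untouched, so
+// [[1, 2, 3], [4, 5, 6]] becomes [[4, 5, 6], [1, 2, 3]].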
+template <typename Device, typename T, int Dims>
+struct Reverse {
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ typename TTypes<bool, 1>::ConstTensor dims,
+ typename TTypes<T, Dims>::Tensor output) {
+ // 'dims' is in host memory, so it can be read directly here.
+ Eigen::array<bool, Dims> reverse_dims;
+ for (int i = 0; i < Dims; ++i) {
+ reverse_dims[i] = dims(i);
+ }
+ output.device(d) = input.reverse(reverse_dims);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REVERSE_OP_H_
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
new file mode 100644
index 0000000000..b510add3f3
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -0,0 +1,33 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/reverse_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_REVERSE(DIM) \
+ template struct functor::Reverse<GPUDevice, uint8, DIM>; \
+ template struct functor::Reverse<GPUDevice, int8, DIM>; \
+ template struct functor::Reverse<GPUDevice, int32, DIM>; \
+ template struct functor::Reverse<GPUDevice, bool, DIM>; \
+ template struct functor::Reverse<GPUDevice, float, DIM>; \
+ template struct functor::Reverse<GPUDevice, double, DIM>;
+DEFINE_REVERSE(0)
+DEFINE_REVERSE(1)
+DEFINE_REVERSE(2)
+DEFINE_REVERSE(3)
+DEFINE_REVERSE(4)
+DEFINE_REVERSE(5)
+DEFINE_REVERSE(6)
+DEFINE_REVERSE(7)
+DEFINE_REVERSE(8)
+#undef DEFINE_REVERSE
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
new file mode 100644
index 0000000000..d41c36e693
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -0,0 +1,101 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class ReverseOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType data_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Reverse")
+ .Input(FakeInput(data_type))
+ .Input(FakeInput())
+ .Attr("T", data_type)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(ReverseOpTest, Reverse_0) {
+ MakeOp(DT_FLOAT);
+ AddInputFromArray<float>(TensorShape({}), {3});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({}));
+ expected.scalar<float>() = expected.scalar<float>().constant(3.f);
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+TEST_F(ReverseOpTest, Reverse_234) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ // [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+ // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]
+ AddInputFromArray<float>(TensorShape({2, 3, 4}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23});
+ AddInputFromArray<bool>(TensorShape({3}), {true, false, true});
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+ // Should become
+ // [[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+ // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]
+ test::FillValues<float>(
+ &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
+ 6, 5, 4, 11, 10, 9, 8});
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+TEST_F(ReverseOpTest, Reverse_1234) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ // [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+ // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]]
+ AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23});
+ AddInputFromArray<bool>(TensorShape({4}), {true, true, false, true});
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
+ // Should become
+ // [[[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+ // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]]
+ test::FillValues<float>(
+ &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
+ 6, 5, 4, 11, 10, 9, 8});
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
new file mode 100644
index 0000000000..6673a700ef
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -0,0 +1,170 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/reverse_sequence_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+void CheckErrors(OpKernelContext* context, int seq_dim) {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ auto seq_lens_t = seq_lens.vec<int64>();
+
+ std::vector<int64> seq_lens_vec(seq_lens_t.size());
+
+ // Copy seq_len info down for validity checks
+ context->eigen_device<Device>().memcpyDeviceToHost(
+ seq_lens_vec.data(), seq_lens_t.data(),
+ sizeof(int64) * seq_lens_t.size());
+
+ OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim"));
+ OP_REQUIRES(context, seq_dim < input.dims(),
+ errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+ seq_dim, " vs. ", input.dims(), ")"));
+
+ OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0),
+ errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ",
+ "(", seq_lens.NumElements(), " vs. ",
+ input.dim_size(0), ")"));
+
+ for (int d = 0; d < seq_lens_vec.size(); ++d) {
+ OP_REQUIRES(context, seq_lens_vec[d] >= 0,
+ errors::InvalidArgument("seq_lens(", d, ") < 0"));
+ OP_REQUIRES(context, seq_lens_vec[d] <= input.dim_size(seq_dim),
+ errors::InvalidArgument("seq_lens(", d, ") > input.dims(",
+ seq_dim, ")"));
+ }
+}
+
+template <>
+void CheckErrors<GPUDevice>(OpKernelContext* context, int seq_dim) {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim"));
+ OP_REQUIRES(context, seq_dim < input.dims(),
+ errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+ seq_dim, " vs. ", input.dims(), ")"));
+
+ OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0),
+ errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ",
+ "(", seq_lens.NumElements(), " vs. ",
+ input.dim_size(0), ")"));
+}
+
+template <typename Device, typename T>
+class ReverseSequenceOp : public OpKernel {
+ public:
+ explicit ReverseSequenceOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()),
+ errors::InvalidArgument("seq_lens input must be 1-dim, not ",
+ seq_lens.dims()));
+
+ auto seq_lens_t = seq_lens.vec<int64>();
+
+ CheckErrors<Device>(context, seq_dim_);
+
+ const int input_dims = input.dims();
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ functor::ReverseSequence<Device, T, NDIM>::Compute( \
+ context->eigen_device<Device>(), input.tensor<T, NDIM>(), seq_dim_, \
+ seq_lens_t, output->tensor<T, NDIM>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "ReverseSequenceOp : Unhandled input dimensions: ",
+ input_dims));
+ }
+ }
+
+ private:
+ int32 seq_dim_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
+};
+
+#define REGISTER_REVERSE_SEQUENCE(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReverseSequenceOp<CPUDevice, type>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE);
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void ReverseSequence<GPUDevice, T, Dims>::Compute( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ int32 seq_dim, TTypes<int64>::ConstVec seq_lens, \
+ typename TTypes<T, Dims>::Tensor output); \
+ extern template struct ReverseSequence<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_REVERSE_SEQUENCE_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReverseSequenceOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU);
+
+#undef REGISTER_REVERSE_SEQUENCE_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h
new file mode 100644
index 0000000000..d1dd572dcb
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op.h
@@ -0,0 +1,56 @@
+#ifndef TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+#define TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+// Generator definition for ReverseSequenceOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace generator {
+
+template <typename T, size_t Dims>
+class ReverseGenerator {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ ReverseGenerator(typename TTypes<T, Dims>::ConstTensor input, int32 seq_dim,
+ TTypes<int64>::ConstVec seq_lengths)
+ : input_(input), seq_dim_(seq_dim), seq_lengths_(seq_lengths) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+ operator()(const Eigen::array<Eigen::DenseIndex, Dims>& coords) const {
+ Eigen::array<Eigen::DenseIndex, Dims> new_coords = coords;
+ if (coords[seq_dim_] < seq_lengths_(coords[0])) {
+ new_coords[seq_dim_] = seq_lengths_(coords[0]) - coords[seq_dim_] - 1;
+ }
+
+ return input_(new_coords);
+ }
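+
+ // Worked example (illustrative only): with seq_dim_ == 1 and
+ // seq_lengths_(0) == 3, batch-0 coordinates map as (0, 0) -> (0, 2),
+ // (0, 1) -> (0, 1) and (0, 2) -> (0, 0), while coordinates at or beyond
+ // the sequence length, e.g. (0, 3), pass through unchanged.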
+
+ private:
+ typename TTypes<T, Dims>::ConstTensor input_;
+ int32 seq_dim_;
+ TTypes<int64>::ConstVec seq_lengths_;
+};
+
+} // namespace generator
+
+namespace functor {
+
+template <typename Device, typename T, size_t Dims>
+struct ReverseSequence {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ int32 seq_dim, TTypes<int64>::ConstVec seq_lengths,
+ typename TTypes<T, Dims>::Tensor output) {
+ generator::ReverseGenerator<T, Dims> generator(input, seq_dim, seq_lengths);
+ output.device(d) = input.generate(generator);
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
new file mode 100644
index 0000000000..7b5d533026
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -0,0 +1,26 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/reverse_sequence_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_SPEC(T, dims) \
+ template class generator::ReverseGenerator<T, dims>; \
+ template struct functor::ReverseSequence<GPUDevice, T, dims>;
+
+#define DEFINE_GPU_SPECS(T) \
+ DEFINE_GPU_SPEC(T, 2); \
+ DEFINE_GPU_SPEC(T, 3); \
+ DEFINE_GPU_SPEC(T, 4); \
+ DEFINE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/save_op.cc b/tensorflow/core/kernels/save_op.cc
new file mode 100644
index 0000000000..71a15c643e
--- /dev/null
+++ b/tensorflow/core/kernels/save_op.cc
@@ -0,0 +1,81 @@
+// See docs in ../ops/io_ops.cc
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+class SaveOp : public OpKernel {
+ public:
+ explicit SaveOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, false);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Save").Device(DEVICE_CPU), SaveOp);
+
+class SaveSlicesOp : public OpKernel {
+ public:
+ explicit SaveSlicesOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, true);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("SaveSlices").Device(DEVICE_CPU), SaveSlicesOp);
+
+class ShardedFilenameOp : public OpKernel {
+ public:
+ explicit ShardedFilenameOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ static const char* input_names[3] = {"basename", "shard", "num_shards"};
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()),
+ errors::InvalidArgument(
+ input_names[i], " must be a scalar, got shape ",
+ ctx->input(i).shape().ShortDebugString()));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
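+ // For example (illustrative only): basename "foo", shard 1, num_shards 8
+ // yields "foo-00001-of-00008".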
+ out->scalar<string>()() = strings::Printf(
+ "%s-%05d-of-%05d", ctx->input(0).scalar<string>()().c_str(),
+ ctx->input(1).scalar<int32>()(), ctx->input(2).scalar<int32>()());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShardedFilename").Device(DEVICE_CPU),
+ ShardedFilenameOp);
+
+class ShardedFilespecOp : public OpKernel {
+ public:
+ explicit ShardedFilespecOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ static const char* input_names[2] = {"basename", "num_shards"};
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()),
+ errors::InvalidArgument(
+ input_names[i], " must be a scalar, got shape ",
+ ctx->input(i).shape().ShortDebugString()));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ out->scalar<string>()() = strings::Printf(
+ "%s-\?\?\?\?\?-of-%05d", ctx->input(0).scalar<string>()().c_str(),
+ ctx->input(1).scalar<int32>()());
+ }
+};
+REGISTER_KERNEL_BUILDER(Name("ShardedFilespec").Device(DEVICE_CPU),
+ ShardedFilespecOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_op_test.cc b/tensorflow/core/kernels/save_op_test.cc
new file mode 100644
index 0000000000..ee1ba492a6
--- /dev/null
+++ b/tensorflow/core/kernels/save_op_test.cc
@@ -0,0 +1,443 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class SaveOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Save")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput(
+ {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SaveOpTest, Simple) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_simple");
+ const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double",
+ "tensor_qint8", "tensor_qint32"};
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({5}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add a 1-d integer tensor
+ AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; });
+
+ // Add a 2-d float tensor
+ AddInput<float>(TensorShape({2, 4}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ // Add a 2-d double tensor
+ AddInput<double>(TensorShape({2, 4}),
+ [](int x) -> double { return static_cast<double>(x) / 20; });
+
+ // Add a 2-d qint8 tensor
+ AddInput<qint8>(TensorShape({3, 2}),
+ [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); });
+
+ // Add a 2-d qint32 tensor
+ AddInput<qint32>(TensorShape({2, 3}), [](int x) -> qint32 {
+ return *reinterpret_cast<qint32*>(&x) * qint8(2);
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ // We expect to find all saved tensors
+ {
+ // The 1-d integer tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type));
+ TensorShape expected({10});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_INT32, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-");
+ int data[10];
+ std::fill_n(data, 10, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(i + 1, data[i]);
+ }
+ }
+
+ {
+ // The 2-d float tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_FLOAT, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ float data[8];
+ std::fill_n(data, 8, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_float", s, data));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, data[i]);
+ }
+ }
+
+ {
+ // The 2-d double tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_DOUBLE, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ double data[8];
+ std::fill_n(data, 8, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_double", s, data));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<double>(i) / 20, data[i]);
+ }
+ }
+
+ {
+ // The 2-d qint8 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type));
+ TensorShape expected({3, 2});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT8, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint8 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data));
+ for (int i = 0; i < 6; ++i) {
+ EXPECT_EQ(*reinterpret_cast<qint8*>(&i), data[i]);
+ }
+ }
+
+ {
+ // The 2-d qint32 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type));
+ TensorShape expected({2, 3});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT32, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint32 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint32", s, data));
+ for (int i = 0; i < 6; ++i) {
+ EXPECT_EQ(*reinterpret_cast<qint32*>(&i) * qint8(2), data[i]);
+ }
+ }
+}
+
+class SaveSlicesOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "SaveSlices")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput(
+ {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+// Here we save only slices. We restore them into a larger tensor and check
+// that the right slice is restored. It is quite tricky to verify that the
+// right slices are actually restored, so instead we just check that
+// CopySliceData() returns true/false depending on the slice we ask for.
+TEST_F(SaveSlicesOpTest, Slices) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_slices");
+ const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double",
+ "tensor_qint8", "tensor_qint32"};
+ // Specifies that the data we save are slices of larger tensors.
+ // See core/framework/tensor_slice.h for the slice syntax.
+ const string tensorshapes[] = {
+ "10 -", // Full contents of a 10 element vector.
+ "2 4 -:0,2", // A 2x2 slice of a 2x4 tensor.
+ "2 4 0,1:2,2", // A 1x2 slice of a 2x4 tensor.
+ "3 2 -:-", // Full contents of a 3x2 tensor.
+ "2 3 1,1:2,1" // Another 1x1 slice of a2x3 tensor.
+ };
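+
+ // Reading the specs above (illustrative note): the leading numbers give the
+ // full tensor shape, and each colon-separated field is either "-" (the whole
+ // dimension) or "start,length". For instance "2 4 -:0,2" describes a 2x4
+ // tensor from which all rows and columns 0..1 are saved, i.e. a 2x2 slice.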
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({5}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add the tensor shapes and slices
+ AddInput<string>(TensorShape({5}), [&tensorshapes](int x) -> string {
+ return tensorshapes[x];
+ });
+
+ // Add a 1-d integer tensor
+ AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; });
+
+ // Add a 2-d float tensor
+ AddInput<float>(TensorShape({2, 2}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ // Add a 2-d double tensor
+ AddInput<double>(TensorShape({1, 2}),
+ [](int x) -> double { return static_cast<double>(x) / 20; });
+
+ // Add a 2-d qint8 tensor
+ AddInput<qint8>(TensorShape({3, 2}),
+ [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); });
+
+ // Add a 2-d qint32 tensor
+ AddInput<qint32>(TensorShape({1, 1}), [](int x) -> qint32 {
+ return *reinterpret_cast<qint32*>(&x) * qint8(2);
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ // We expect to find all saved tensors
+ {
+ // The 1-d integer tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type));
+ TensorShape expected({10});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_INT32, type);
+
+ // We saved the full tensor so we should be able to read it all.
+ TensorSlice s = TensorSlice::ParseOrDie("-");
+ int data[10];
+ EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data));
+ }
+
+ {
+ // The 2-d float tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_FLOAT, type);
+
+ // We saved the slice "-:0,2" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("-:0,2");
+ float data[8];
+ EXPECT_FALSE(reader.CopySliceData("tensor_float", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_float", saved_slice, data));
+ }
+
+ {
+ // The 2-d double tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_DOUBLE, type);
+
+ // We saved the slice "0,1:2,2" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("0,1:2,2");
+ double data[8];
+ EXPECT_FALSE(reader.CopySliceData("tensor_double", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_double", saved_slice, data));
+ }
+
+ {
+ // The 2-d qint8 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type));
+ TensorShape expected({3, 2});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT8, type);
+
+ // We saved the full slice.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint8 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data));
+ }
+
+ {
+ // The 2-d qint32 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type));
+ TensorShape expected({2, 3});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT32, type);
+
+ // We saved the slice "1,1:2,1" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("1,1:2,1");
+ qint32 data[6];
+ EXPECT_FALSE(reader.CopySliceData("tensor_qint32", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint32", saved_slice, data));
+ }
+}
+
+class SaveOpSlices2Test : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "SaveSlices")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput({DT_INT32, DT_INT32, DT_FLOAT}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SaveOpSlices2Test, TwoSlices) {
+ const string filename = io::JoinPath(testing::TmpDir(), "three_slices");
+ // We will save 2 slices of the tensor named "four_by_sixteen", which is
+ // 4x16, and the full "small" tensor.
+ const string tensornames[] = {"four_by_sixteen", "four_by_sixteen", "small"};
+ const string tensorshapes[] = {
+ // Slice specifications for the 2 slices of "four_by_sixteen"
+ "4 16 0,2:-", // 1st slice covers indices 0 and 1 in the first dim.
+ "4 16 2,2:-", // 2nd slice covers indices 2 and 3 in the first dim.
+ "" // We save the full "small" tensors.
+ };
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({3}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add the tensor shapes and slices
+ AddInput<string>(TensorShape({3}), [&tensorshapes](int x) -> string {
+ return tensorshapes[x];
+ });
+
+ // Add an integer tensor for slice 0,2:- of a 4x16 tensor: It is 2x16.
+ AddInput<int32>(TensorShape({2, 16}), [](int x) -> int32 { return x + 1; });
+
+ // Add an integer tensor for slice 2,2:- of a 4x16 tensor: It is 2x16.
+ AddInput<int32>(TensorShape({2, 16}),
+ [](int x) -> int32 { return 10 * (x + 1); });
+
+ // Add a float tensor for "small"
+ AddInput<float>(TensorShape({2, 4}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ {
+ // Reload the two slices of "four_by_sixteen" into that tensor.
+ Tensor reloaded(DT_INT32, {4, 16});
+
+ // We expect to find all slices
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("four_by_sixteen", &shape, &type));
+ EXPECT_TRUE(shape.IsSameSize(reloaded.shape()));
+ EXPECT_EQ(type, reloaded.dtype());
+
+ // Reload the whole tensor.
+ EXPECT_TRUE(reader.CopySliceData("four_by_sixteen",
+ TensorSlice(reloaded.dims()),
+ reloaded.flat<int>().data()));
+
+ {
+ auto slice = reloaded.Slice(0, 2).flat<int>();
+ for (int i = 0; i < slice.size(); ++i) {
+ EXPECT_EQ(i + 1, slice(i));
+ }
+ }
+ {
+ auto slice = reloaded.Slice(2, 4).flat<int>();
+ for (int i = 0; i < slice.size(); ++i) {
+ EXPECT_EQ(10 * (i + 1), slice(i));
+ }
+ }
+ }
+
+ {
+ // Reload the small float tensor.
+ Tensor reloaded(DT_FLOAT, {2, 4});
+
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("small", &shape, &type));
+ EXPECT_TRUE(shape.IsSameSize(reloaded.shape()));
+ EXPECT_EQ(DT_FLOAT, reloaded.dtype());
+
+ EXPECT_TRUE(reader.CopySliceData("small", TensorSlice(reloaded.dims()),
+ reloaded.flat<float>().data()));
+
+ for (int64 i = 0; i < reloaded.NumElements(); ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, reloaded.flat<float>().data()[i]);
+ }
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
new file mode 100644
index 0000000000..88fcc1bdcc
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -0,0 +1,167 @@
+// See docs in ../ops/state_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+enum class UpdateOp { ASSIGN, ADD, SUB };
+
+template <class T, typename Index, UpdateOp op>
+class ScatterUpdateOp : public OpKernel {
+ public:
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8,
+ // etc. here. Should we have the framework do some sort of
+ // integer promotion automatically, or should that be something
+ // that users have to do explicitly with a conversion operator
+ // in the graph?
+ explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
+ OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ if (use_exclusive_lock_) {
+ // Hold mutex while we apply updates
+ mutex_lock l(*c->input_ref_mutex(0));
+ DoCompute(c);
+ } else {
+ DoCompute(c);
+ }
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ // Check whether updates.shape = indices.shape + params.shape[1:]
+ static bool ValidShapes(const Tensor& params, const Tensor& updates,
+ const Tensor& indices) {
+ if (updates.dims() != indices.dims() + params.dims() - 1) return false;
+ for (int d = 0; d < indices.dims(); d++) {
+ if (updates.dim_size(d) != indices.dim_size(d)) {
+ return false;
+ }
+ }
+ for (int d = 1; d < params.dims(); d++) {
+ if (params.dim_size(d) != updates.dim_size(d - 1 + indices.dims())) {
+ return false;
+ }
+ }
+ return true;
+ }
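+
+ // For example (illustrative only): params of shape [5, 3] and indices of
+ // shape [2] require updates of shape [2, 3]; indices of shape [2, 3] would
+ // require updates of shape [2, 3, 3].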
+
+ void DoCompute(OpKernelContext* c) {
+ Tensor Tparams = c->mutable_input(0, use_exclusive_lock_);
+ OP_REQUIRES(c, Tparams.IsInitialized(),
+ errors::FailedPrecondition("Null ref for params"));
+ const Tensor& Tindices = c->input(1);
+ const Tensor& Tupdates = c->input(2);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()),
+ errors::InvalidArgument("params must be at least 1-D, got shape ",
+ Tparams.shape().ShortDebugString()));
+ OP_REQUIRES(
+ c, ValidShapes(Tparams, Tupdates, Tindices),
+ errors::InvalidArgument(
+ "Must have updates.shape = indices.shape + params.shape[1:], got ",
+ "updates.shape ", Tupdates.shape().ShortDebugString(),
+ ", indices.shape ", Tindices.shape().ShortDebugString(),
+ ", params.shape ", Tparams.shape().ShortDebugString()));
+ const Index N = Tindices.NumElements();
+
+ // We always return the input ref.
+ c->forward_ref_input_to_ref_output(0, 0);
+
+ if (N > 0) {
+ const Index first_dim_size = Tparams.dim_size(0);
+ // Validate all the indices are in range
+ auto Tindices_vec = Tindices.flat<Index>();
+ for (Index i = 0; i < N; i++) {
+ const Index index = Tindices_vec(i);
+ OP_REQUIRES(c, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+ auto Tparams_flat = Tparams.flat_outer_dims<T>();
+ auto Tupdates_flat =
+ Tupdates.shaped<T, 2>({N, Tupdates.NumElements() / N});
+ for (Index i = 0; i < N; i++) {
+ // Copy last Ndim-1 dimensions of Tupdates[i] to
+ // Tparams[Tindices[i]]
+ switch (op) {
+ case UpdateOp::ASSIGN: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) =
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ case UpdateOp::ADD: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) +=
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ case UpdateOp::SUB: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) -=
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_SCATTER_UPDATE(type, index_type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ScatterUpdate") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::ASSIGN>);
+
+#define REGISTER_SCATTER_UPDATE_INT32(type) REGISTER_SCATTER_UPDATE(type, int32)
+#define REGISTER_SCATTER_UPDATE_INT64(type) REGISTER_SCATTER_UPDATE(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT64);
+
+#undef REGISTER_SCATTER_UPDATE_INT64
+#undef REGISTER_SCATTER_UPDATE_INT32
+#undef REGISTER_SCATTER_UPDATE
+
+#define REGISTER_SCATTER_ADD(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("ScatterAdd") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::ADD>);
+
+#define REGISTER_SCATTER_ADD_INT32(type) REGISTER_SCATTER_ADD(type, int32)
+#define REGISTER_SCATTER_ADD_INT64(type) REGISTER_SCATTER_ADD(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT64);
+
+#undef REGISTER_SCATTER_ADD_INT32
+#undef REGISTER_SCATTER_ADD_INT64
+#undef REGISTER_SCATTER_ADD
+
+#define REGISTER_SCATTER_SUB(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("ScatterSub") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::SUB>);
+
+#define REGISTER_SCATTER_SUB_INT32(type) REGISTER_SCATTER_SUB(type, int32)
+#define REGISTER_SCATTER_SUB_INT64(type) REGISTER_SCATTER_SUB(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT64);
+
+#undef REGISTER_SCATTER_SUB_INT64
+#undef REGISTER_SCATTER_SUB_INT32
+#undef REGISTER_SCATTER_SUB
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
new file mode 100644
index 0000000000..8885f1edb3
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -0,0 +1,255 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+class ScatterUpdateOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType index_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ScatterUpdate")
+ .Input(FakeInput(DT_FLOAT_REF))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(ScatterUpdateOpTest, Simple_TwoD32) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+ test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001,
+ 10002, 0, 0, 0, 777, 778, 779});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_TwoD64) {
+ MakeOp(DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int64>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+ test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001,
+ 10002, 0, 0, 0, 777, 778, 779});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_ZeroD) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ AddInputFromArray<float>(TensorShape({}), {101});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+ test::FillValues<float>(&expected, {0, 0, 0, 101, 0});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_OneD) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3}), {100, 101, 102});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+ test::FillValues<float>(&expected, {100, 0, 102, 0, 101});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, HigherRank) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({8}), {0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({2, 3}), {0, 4, 2, 1, 3, 6});
+ AddInputFromArray<float>(TensorShape({2, 3}), {10, 20, 30, 40, 50, 60});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8}));
+ test::FillValues<float>(&expected, {10, 40, 30, 50, 20, 0, 60, 0});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 99});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Index 99 at offset 2 in indices is out of range"))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 3}), {0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({1, 3}), {0, 4, 99});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(
+ TensorShape({3, 4}),
+ {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({2, 3}),
+ {100, 101, 102, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+class ScatterUpdateBM : public ScatterUpdateOpTest {
+ public:
+ virtual void TestBody() {}
+ void MakeBenchmarkOp(const char* op, DataType index_type) {
+ ASSERT_OK(NodeDefBuilder("myop", op)
+ .Input(FakeInput(DT_FLOAT_REF))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ TF_CHECK_OK(InitOp());
+ }
+};
+
+template <typename Index>
+static void BM_ScatterHelper(int iters, int embedding_size, const char* op) {
+ testing::StopTiming();
+ const int kRows = 10000000 / embedding_size;
+ std::vector<float> values;
+ for (int i = 0; i < kRows * embedding_size; i++) {
+ values.push_back(i);
+ }
+ const int kNumUpdates = 1000;
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ std::vector<Index> indices;
+ std::vector<float> updates;
+ for (int i = 0; i < kNumUpdates; i++) {
+ indices.push_back(rnd.Uniform(kRows));
+ for (int j = 0; j < embedding_size; j++) {
+ updates.push_back(i * 10 + j);
+ }
+ }
+
+ ScatterUpdateBM bm;
+ bm.MakeBenchmarkOp(op, DataTypeToEnum<Index>::v());
+ bm.AddInputFromArray<float>(TensorShape({kRows, embedding_size}), values);
+ bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
+ bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
+ updates);
+ testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+ iters);
+ testing::StartTiming();
+ while (iters-- > 0) {
+ Status s = bm.RunOpKernel();
+ }
+}
+
+static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
+ BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
+}
+static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
+ BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
+}
+
+static void BM_ScatterAddInt32(int iters, int embedding_size) {
+ BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
+}
+static void BM_ScatterAddInt64(int iters, int embedding_size) {
+ BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
+}
+
+BENCHMARK(BM_ScatterUpdateInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterUpdateInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+BENCHMARK(BM_ScatterAddInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterAddInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
new file mode 100644
index 0000000000..2b6a8c5a88
--- /dev/null
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -0,0 +1,466 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// This operator handles reducing segments along the first dimension.
+// See core/ops/math_ops.cc for more details.
+template <typename Device, class T, class Index, typename Reducer>
+class SegmentReductionOp : public OpKernel {
+ public:
+ explicit SegmentReductionOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& segment_ids = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+ const int64 num_indices = segment_ids.NumElements();
+ OP_REQUIRES(context, num_indices == input.dim_size(0),
+ errors::InvalidArgument(
+ "segment_ids should be the same size as dimension 0 of"
+ " input."));
+
+ auto input_flat = input.flat_outer_dims<T>();
+ const int64 num_col = input_flat.dimension(1);
+
+ const auto segment_vec = segment_ids.vec<Index>();
+ // Note that the current implementation assumes that segment_vec values are
+ // sorted.
+ const Index output_rows =
+ num_indices > 0 ? segment_vec(num_indices - 1) + 1 : 0;
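+ // For example (illustrative only): segment_ids = [0, 0, 1, 2, 2, 2] yields
+ // output_rows = 3, i.e. one output row per segment id.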
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, output_rows);
+
+ // Note that we do not initialize the output buffer with a default value.
+ // We require that segment ids be sorted and cover all values (otherwise we
+ // return an error).
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ auto output_flat = output->flat_outer_dims<T>();
+
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
+ dims_to_reduce[0] = 0;
+#else
+ Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+#endif
+ Index start = 0, end = 1;
+ // TODO(agarwal): if this loop becomes a bottleneck, consider sharding it
+ // across threads.
+ Eigen::DSizes<Eigen::DenseIndex, 1> out_slice_shape(num_col);
+ while (end <= num_indices) {
+ if (end < num_indices) {
+ if (segment_vec(start) == segment_vec(end)) {
+ ++end;
+ continue;
+ }
+ // We have a new segment here. Verify that the segment ids grow by one
+ // each time, so that we cover every possible output value.
+ OP_REQUIRES(
+ context, segment_vec(start) + 1 == segment_vec(end),
+ errors::InvalidArgument("segment ids are not increasing by 1"));
+ }
+
+ // Process segment [start, end)
+ const T* in_slice_ptr = &input_flat(start, 0);
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> OutT;
+ T* out_slice_ptr = &output_flat(segment_vec(start), 0);
+ OutT out_slice(out_slice_ptr, out_slice_shape);
+ // We don't use out_slice.device(context->eigen_device<Device>())
+ // because these pieces of work are likely to be very small and
+ // the context switching overhead dwarfs any benefit we get from
+ // using another thread to do this work.
+ if (start == end - 1) {
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> InT;
+ InT in_slice(in_slice_ptr, out_slice_shape);
+ out_slice = in_slice;
+ } else {
+ Eigen::DSizes<Eigen::DenseIndex, 2> in_slice_shape(end - start,
+ num_col);
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
+ Eigen::Unaligned> InT;
+ InT in_slice(in_slice_ptr, in_slice_shape);
+
+ out_slice = in_slice.reduce(dims_to_reduce, Reducer());
+ }
+ start = end;
+ ++end;
+ }
+ }
+};
+
+#define REGISTER_CPU_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentSum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::SumReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMean") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MeanReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentProd") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::ProdReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMin") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MinReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMax") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MaxReducer<type>>);
+
+#define REGISTER_CPU_KERNELS_ALL(type) \
+ REGISTER_CPU_KERNELS(type, int32); \
+ REGISTER_CPU_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS_ALL);
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_CPU_KERNELS_ALL
+
+// Similar to SegmentReductionOp but handles unsorted segment ids and lets the
+// caller specify the size of the output.
+template <typename Device, class T, class Index>
+class UnsortedSegmentSumOp : public OpKernel {
+ public:
+ explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& data = context->input(0);
+ const Tensor& segment_ids = context->input(1);
+ const Tensor& num_segments = context->input(2);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyScalar(num_segments.shape()),
+ errors::InvalidArgument("num_segments should be a scalar, not shape ",
+ num_segments.shape().ShortDebugString()));
+
+ OP_REQUIRES(context,
+ TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+ errors::InvalidArgument(
+ "data.shape = ", data.shape().ShortDebugString(),
+ " does not start with segment_ids.shape = ",
+ segment_ids.shape().ShortDebugString()));
+
+ const auto segment_flat = segment_ids.flat<Index>();
+ const int32 N = segment_flat.dimension(0);
+ const int32 output_rows = num_segments.scalar<int32>()();
+
+ if (N > 0) {
+ Eigen::Tensor<Index, 0, Eigen::RowMajor> m = segment_flat.maximum();
+ OP_REQUIRES(
+ context, m() < output_rows,
+ errors::InvalidArgument("More segments found than output size"));
+ }
+
+ TensorShape output_shape;
+ output_shape.AddDim(output_rows);
+ for (int i = segment_ids.dims(); i < data.dims(); i++) {
+ output_shape.AddDim(data.dim_size(i));
+ }
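+ // For example (illustrative only): data of shape [4, 5] with segment_ids of
+ // shape [4] and num_segments = 3 produces an output of shape [3, 5].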
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ auto output_flat = output->flat_outer_dims<T>();
+ output_flat.setZero();
+
+ if (data.NumElements() > 0) {
+ auto data_flat = data.shaped<T, 2>({N, data.NumElements() / N});
+ for (int i = 0; i < N; ++i) {
+ output_flat.template chip<0>(segment_flat(i)) +=
+ data_flat.template chip<0>(i);
+ }
+ }
+ }
+};
+
+#define REGISTER_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+
+#define REGISTER_CPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_CPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_CPU_UNSORTED_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_UNSORTED_KERNELS_ALL);
+#undef REGISTER_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_UNSORTED_KERNELS_ALL
+
+// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
+// by two dense tensors, one containing the data, and the other containing
+// indices into the data.
+template <typename Device, class T>
+class SparseSegmentReductionOpBase : public OpKernel {
+ public:
+ explicit SparseSegmentReductionOpBase(OpKernelConstruction* context,
+ bool is_mean)
+ : OpKernel(context), is_mean_(is_mean) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& indices = context->input(1);
+ const Tensor& segment_ids = context->input(2);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+
+ const int32 num_indices = indices.NumElements();
+ OP_REQUIRES(context, num_indices == segment_ids.NumElements(),
+ errors::InvalidArgument(
+ "segment_ids and indices should have same size."));
+
+ auto input_flat = input.flat_outer_dims<T>();
+
+ const auto indices_vec = indices.vec<int32>();
+ const auto segment_vec = segment_ids.vec<int32>();
+ // Note that the current implementation assumes that segment_vec values are
+ // sorted.
+ const int32 output_rows =
+ num_indices > 0 ? segment_vec(num_indices - 1) + 1 : 0;
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, output_rows);
+
+ // Note that we do not initialize the output buffer with a default value.
+ // We require that segment ids be sorted and cover all values (otherwise we
+ // return an error).
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ if (num_indices == 0) return;
+ auto output_flat = output->flat_outer_dims<T>();
+
+ int32 start = 0, end = 1;
+ while (end <= num_indices) {
+ if (end < num_indices) {
+ if (segment_vec(start) == segment_vec(end)) {
+ ++end;
+ continue;
+ }
+ // We have a new segment here. Verify that the segment ids grow by one
+ // each time, so that we cover every possible output value.
+ OP_REQUIRES(
+ context, segment_vec(start) + 1 == segment_vec(end),
+ errors::InvalidArgument("segment ids are not increasing by 1"));
+ }
+
+ auto out = output_flat.template chip<0>(segment_vec(start));
+#define I(i) input_flat.template chip<0>(indices_vec(start + i))
+ int num = end - start;
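+ // For segments with more than one row, the switch below sums the first
+ // num % 8 rows (remainders of 0 and 1 are treated as 8 and 9 so the loop
+ // afterwards always consumes full groups of eight), and the loop then adds
+ // the remaining rows eight at a time. For means of fewer than 10 rows the
+ // division by num is folded into that first group via m.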
+ if (num == 1) {
+ out = I(0);
+ } else {
+ int r = num % 8;
+ T m = (is_mean_ && (num < 10)) ? num : 1;
+ switch (r) {
+ case 2:
+ out = (I(0) + I(1)) / m;
+ break;
+ case 3:
+ out = (I(0) + I(1) + I(2)) / m;
+ break;
+ case 4:
+ out = (I(0) + I(1) + I(2) + I(3)) / m;
+ break;
+ case 5:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4)) / m;
+ break;
+ case 6:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5)) / m;
+ break;
+ case 7:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6)) / m;
+ break;
+ case 0:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7)) / m;
+ r = 8;
+ break;
+ case 1:
+ out =
+ (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7) + I(8)) /
+ m;
+ r = 9;
+ break;
+ }
+ for (; r < num; r += 8) {
+ out += I(r) + I(r + 1) + I(r + 2) + I(r + 3) + I(r + 4) + I(r + 5) +
+ I(r + 6) + I(r + 7);
+ }
+#undef I
+ if (is_mean_ && num >= 10) {
+ out = out / static_cast<T>(num);
+ }
+ }
+ start = end;
+ ++end;
+ }
+ }
+
+ private:
+ bool is_mean_;
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionMeanOp
+ : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+ explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context)
+ : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionSumOp
+ : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+ explicit SparseSegmentReductionSumOp(OpKernelConstruction* context)
+ : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/) {}
+};
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseSegmentSum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseSegmentReductionSumOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS);
+#undef REGISTER_CPU_SPARSE_KERNELS
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseSegmentMean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseSegmentReductionMeanOp<CPUDevice, type>);
+REGISTER_CPU_SPARSE_KERNELS(float);
+REGISTER_CPU_SPARSE_KERNELS(double);
+#undef REGISTER_CPU_SPARSE_KERNELS
+
+template <class T>
+class SparseSegmentMeanGradOp : public OpKernel {
+ public:
+ explicit SparseSegmentMeanGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& indices = context->input(1);
+ const Tensor& segment_ids = context->input(2);
+ const Tensor& output_dim0 = context->input(3);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(output_dim0.shape()),
+ errors::InvalidArgument("output_dim0 should be a scalar."));
+
+ const int64 N = indices.NumElements();
+ OP_REQUIRES(context, N == segment_ids.NumElements(),
+ errors::InvalidArgument(
+ "segment_ids and indices should have same size."));
+ const int32 M = output_dim0.scalar<int32>()();
+
+ auto input_flat = input.flat_outer_dims<T>();
+ const auto indices_vec = indices.vec<int32>();
+ const auto segment_vec = segment_ids.vec<int32>();
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, M);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ if (M == 0 || N == 0) return;
+
+ // Note that similar to SparseSegmentMean, we assume that segment_vec is
+ // already sorted and has non-negative values.
+ int num_segments = segment_vec(N - 1) + 1;
+ OP_REQUIRES(context, input.dim_size(0) == num_segments,
+ errors::InvalidArgument("Invalid number of segments"));
+
+ // Compute scaling factors for input.
+ std::vector<double> scaling(num_segments, 0.0);
+ for (int64 i = 0; i < N; ++i) {
+ scaling[segment_vec(i)] += 1;
+ }
+ for (int i = 0; i < scaling.size(); ++i) {
+ scaling[i] = 1.0 / std::max(scaling[i], 1.0);
+ }
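+ // For example (illustrative only): segment_vec = [0, 0, 1] gives counts
+ // {2, 1} and scaling = {0.5, 1.0}; segments that never appear keep a scale
+ // of 1.0 because of the std::max above.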
+
+ auto output_flat = output->flat_outer_dims<T>();
+ output_flat.setZero();
+ std::vector<bool> is_modified(M, false);
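+ // is_modified tracks which output rows have already received a
+ // contribution: the first write to a row assigns, later writes accumulate,
+ // and rows that are never touched keep the zeros from setZero() above.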
+
+ for (int64 i = 0; i < N; ++i) {
+ int output_idx = indices_vec(i);
+ int idx = segment_vec(i);
+ T scale = static_cast<T>(scaling[idx]);
+ if (is_modified[output_idx]) {
+ if (scale == 1.0) {
+ output_flat.template chip<0>(output_idx) +=
+ input_flat.template chip<0>(idx);
+ } else {
+ output_flat.template chip<0>(output_idx) +=
+ input_flat.template chip<0>(idx) * scale;
+ }
+ } else {
+ if (scale == 1.0) {
+ output_flat.template chip<0>(output_idx) =
+ input_flat.template chip<0>(idx);
+ } else {
+ output_flat.template chip<0>(output_idx) =
+ input_flat.template chip<0>(idx) * scale;
+ }
+ }
+ is_modified[output_idx] = true;
+ }
+ }
+};
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER(Name("SparseSegmentMeanGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T"), \
+ SparseSegmentMeanGradOp<type>);
+
+REGISTER_CPU_SPARSE_KERNELS(float);
+REGISTER_CPU_SPARSE_KERNELS(double);
+
+#undef REGISTER_CPU_SPARSE_KERNELS
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
new file mode 100644
index 0000000000..87647a21a8
--- /dev/null
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -0,0 +1,157 @@
+#include <functional>
+
+#include "tensorflow/core/public/session_options.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+
+namespace tensorflow {
+
+template <typename Index>
+static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
+ Index num_cols, Index segment_size) {
+ testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ // Create inputs
+ gtl::InlinedVector<TensorValue, 4> reduction_inputs;
+ TensorShape shape1({num_rows, num_cols});
+ Tensor input1(DT_FLOAT, shape1);
+ reduction_inputs.push_back({nullptr, &input1});
+
+ TensorShape shape2({num_rows});
+ Tensor input2(DataTypeToEnum<Index>::v(), shape2);
+ test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index {
+ return std::min(i / segment_size, num_rows - 1);
+ });
+ reduction_inputs.push_back({nullptr, &input2});
+
+ NodeDef reduction_node_def;
+ TF_CHECK_OK(NodeDefBuilder(reduction, reduction)
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DataTypeToEnum<Index>::v()))
+ .Finalize(&reduction_node_def));
+ Status status;
+ std::unique_ptr<OpKernel> reduction_op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status));
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &reduction_inputs;
+ params.op_kernel = reduction_op.get();
+ params.output_alloc_attr = [&device, &reduction_op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host =
+ (reduction_op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> reduction_context(
+ new OpKernelContext(params));
+
+ reduction_op->Compute(reduction_context.get());
+ TF_CHECK_OK(reduction_context->status());
+ testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete reduction_context->release_output(0).tensor;
+ reduction_op->Compute(reduction_context.get());
+ }
+ int64 bytes_per_iter =
+ static_cast<int64>(num_rows * num_cols * sizeof(float));
+ testing::BytesProcessed(bytes_per_iter * iters);
+}
+
+#define BM_Reduce(O, R, C, S) \
+ static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
+ BM_SegmentReduction<int32>(iters, #O, R, C, S); \
+ } \
+ static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
+ BM_SegmentReduction<int64>(iters, #O, R, C, S); \
+ } \
+ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \
+ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);
+
+#define BM_Reduce_Arg(R, C, S) \
+ BM_Reduce(SegmentSum, R, C, S); \
+ BM_Reduce(SegmentMean, R, C, S);
+
+BM_Reduce_Arg(64, 32, 1);
+BM_Reduce_Arg(4096, 128, 1);
+
+BM_Reduce_Arg(16, 8, 2);
+BM_Reduce_Arg(64, 32, 2);
+BM_Reduce_Arg(4096, 32, 2);
+BM_Reduce_Arg(4096, 128, 2);
+
+static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ CHECK_LE(uniqueness, 1.0);
+ CHECK_GT(uniqueness, 0.0);
+
+ const int kNumIndices = size;
+ Tensor indices(DT_INT32, TensorShape({kNumIndices}));
+ auto indices_flat = indices.flat<int32>();
+ Tensor segments(DT_INT32, TensorShape({kNumIndices}));
+ auto segments_flat = segments.flat<int32>();
+
+ int kUniqueIndices = uniqueness * kNumIndices;
+ Tensor output_dim0(DT_INT32, TensorShape({}));
+ output_dim0.scalar<int32>()() = kUniqueIndices;
+
+ for (int i = 0; i < kNumIndices; ++i) {
+ indices_flat(i) = (i * 31) % kUniqueIndices;
+ segments_flat(i) = i * .8;
+ }
+
+ const int kDim1 = segments_flat(kNumIndices - 1) + 1;
+ const int kDim2 = 128;
+ Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2}));
+ input.flat<float>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad")
+ .Input(test::graph::Constant(g, input))
+ .Input(test::graph::Constant(g, indices))
+ .Input(test::graph::Constant(g, segments))
+ .Input(test::graph::Constant(g, output_dim0))
+ .Attr("T", DT_FLOAT)
+ .Finalize(g, &node));
+
+ testing::UseRealTime();
+ testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
+ sizeof(float));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+}
+
+static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
+ return SparseSegmentMeanGradHelper(iters, 1.0, size);
+}
+
+static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
+ return SparseSegmentMeanGradHelper(iters, 0.01, size);
+}
+
+BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
new file mode 100644
index 0000000000..2abb183d1a
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -0,0 +1,116 @@
+#include "tensorflow/core/kernels/sendrecv_ops.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
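+// A rendezvous key names one logical tensor transfer: the sending device,
+// its incarnation number, the receiving device and the tensor name, with the
+// frame/iteration of the executing graph appended below.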
+static string GetRendezvousKeyPrefix(const string& send_device,
+ const string& recv_device,
+ const uint64 send_device_incarnation,
+ const string& tensor_name) {
+ return strings::StrCat(send_device, ";",
+ strings::FpToString(send_device_incarnation), ";",
+ recv_device, ";", tensor_name);
+}
+
+static string GetRendezvousKey(const string& key_prefix,
+ const FrameAndIter& frame_iter) {
+ return strings::StrCat(key_prefix, ";", frame_iter.frame_id, ":",
+ frame_iter.iter_id);
+}
+
+SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ string send_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+ string recv_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+ uint64 send_device_incarnation;
+ OP_REQUIRES_OK(
+ ctx, ctx->GetAttr("send_device_incarnation",
+ reinterpret_cast<int64*>(&send_device_incarnation)));
+ string tensor_name;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+ key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
+ send_device_incarnation, tensor_name);
+}
+
+void SendOp::Compute(OpKernelContext* ctx) {
+ OP_REQUIRES(
+ ctx, ctx->rendezvous() != nullptr,
+ errors::Internal("Op kernel context needs to provide a rendezvous."));
+ const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter());
+ VLOG(2) << "Send " << key;
+
+  // The device context is passed across the Send/Recv boundary, so that
+  // the device context used to produce the tensor is also used when
+  // performing the copy on the recv side (which may be a different
+  // device).
+ Rendezvous::Args args;
+ args.device_context = ctx->op_device_context();
+ args.alloc_attrs = ctx->input_alloc_attr(0);
+ Status s =
+ ctx->rendezvous()->Send(key, args, ctx->input(0), ctx->is_input_dead());
+ ctx->SetStatus(s);
+}
+
+REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
+REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
+
+REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostSend").Device(DEVICE_GPU).HostMemory("tensor"), SendOp);
+
+RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+ string send_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+ string recv_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+ uint64 send_device_incarnation;
+ OP_REQUIRES_OK(
+ ctx, ctx->GetAttr("send_device_incarnation",
+ reinterpret_cast<int64*>(&send_device_incarnation)));
+ string tensor_name;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+ key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
+ send_device_incarnation, tensor_name);
+}
+
+void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+ OP_REQUIRES(
+ ctx, ctx->rendezvous() != nullptr,
+ errors::Internal("Op kernel context needs to provide a rendezvous."));
+ const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter());
+ VLOG(2) << "Recv " << key;
+
+ Rendezvous::Args args;
+ args.device_context = ctx->op_device_context();
+ args.alloc_attrs = ctx->output_alloc_attr(0);
+ ctx->rendezvous()->RecvAsync(
+ key, args, [ctx, done](const Status& s, const Rendezvous::Args& send_args,
+ const Rendezvous::Args& recv_args,
+ const Tensor& val, bool is_dead) {
+ ctx->SetStatus(s);
+ if (s.ok()) {
+ // 'ctx' allocates the output tensor of the expected type. The
+ // runtime checks whether the tensor received here is the same type.
+ if (!is_dead) {
+ ctx->set_output(0, val);
+ }
+ *ctx->is_output_dead() = is_dead;
+ }
+ done();
+ });
+}
+
+REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
+REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
+
+REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
new file mode 100644
index 0000000000..b3f5703ccf
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -0,0 +1,32 @@
+#ifndef TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+#define TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class SendOp : public OpKernel {
+ public:
+ explicit SendOp(OpKernelConstruction* ctx);
+ void Compute(OpKernelContext* ctx) override;
+
+ private:
+ string key_prefix_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
+};
+
+class RecvOp : public AsyncOpKernel {
+ public:
+ explicit RecvOp(OpKernelConstruction* ctx);
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+ string key_prefix_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
+};
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SENDRECV_OPS_H_
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
new file mode 100644
index 0000000000..60ba2e15f9
--- /dev/null
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -0,0 +1,123 @@
+// See docs in ../ops/math_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+int32 GetValue(int32 v) { return v; }
+
+template <typename T>
+class RangeOp : public OpKernel {
+ public:
+ explicit RangeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& start_in = context->input(0);
+ const Tensor& limit_in = context->input(1);
+ const Tensor& delta_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(start_in.shape()),
+ errors::InvalidArgument("start must be a scalar, not shape ",
+ start_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(limit_in.shape()),
+ errors::InvalidArgument("limit must be a scalar, not shape ",
+ limit_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(delta_in.shape()),
+ errors::InvalidArgument("delta must be a scalar, not shape ",
+ delta_in.shape().ShortDebugString()));
+ const int32 start = GetValue(start_in.scalar<T>()());
+ const int32 limit = GetValue(limit_in.scalar<T>()());
+ OP_REQUIRES(context, start <= limit,
+ errors::InvalidArgument("Requires start <= limit: ", start, "/",
+ limit));
+ const int32 delta = GetValue(delta_in.scalar<T>()());
+ OP_REQUIRES(context, delta > 0,
+ errors::InvalidArgument("Requires delta > 0: ", delta));
+ int32 size = (limit - start + delta - 1) / delta;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({size}), &out));
+ auto flat = out->flat<T>();
+ int32 val = start;
+ for (int32 i = 0; i < size; ++i) {
+ flat(i) = T(val);
+ val += delta;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Range")
+ .Device(DEVICE_CPU)
+ .HostMemory("start")
+ .HostMemory("limit")
+ .HostMemory("delta")
+ .HostMemory("output"),
+ RangeOp<int32>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Range")
+ .Device(DEVICE_GPU)
+ .HostMemory("start")
+ .HostMemory("limit")
+ .HostMemory("delta")
+ .HostMemory("output"),
+ RangeOp<int32>);
+#endif // GOOGLE_CUDA
+
+template <typename T>
+class LinSpaceOp : public OpKernel {
+ public:
+ explicit LinSpaceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& start_in = context->input(0);
+ const Tensor& stop_in = context->input(1);
+ const Tensor& num_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(start_in.shape()),
+ errors::InvalidArgument("start must be a scalar, not shape ",
+ start_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(stop_in.shape()),
+ errors::InvalidArgument("stop must be a scalar, not shape ",
+ stop_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_in.shape()),
+ errors::InvalidArgument("num must be a scalar, not shape ",
+ num_in.shape().ShortDebugString()));
+ const T start = start_in.scalar<T>()();
+ const T stop = stop_in.scalar<T>()();
+ const int32 num = num_in.scalar<int32>()();
+ OP_REQUIRES(context, num > 0,
+ errors::InvalidArgument("Requires num > 0: ", num));
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({num}), &out));
+ auto flat = out->flat<T>();
+ if (num == 1) {
+ flat(0) = start;
+ } else {
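+      // Both endpoints are included, so num points span the range in
+      // num - 1 equal steps.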
+ const T step = (stop - start) / (num - 1);
+ for (int32 i = 0; i < num; ++i) flat(i) = start + step * i;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LinSpace")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("start")
+ .HostMemory("stop")
+ .HostMemory("num")
+ .HostMemory("output"),
+ LinSpaceOp<float>);
+REGISTER_KERNEL_BUILDER(Name("LinSpace")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("start")
+ .HostMemory("stop")
+ .HostMemory("num")
+ .HostMemory("output"),
+ LinSpaceOp<double>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
new file mode 100644
index 0000000000..7cb1da8983
--- /dev/null
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -0,0 +1,261 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ShapeOp : public OpKernel {
+ public:
+ explicit ShapeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int rank = inp.dims();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out));
+ auto vec = out->vec<int32>();
+ for (int i = 0; i < rank; ++i) vec(i) = inp.dim_size(i);
+ }
+
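+  // Producing a shape only reads tensor metadata, so the kernel is cheap.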
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Shape").Device(DEVICE_CPU).HostMemory("output"),
+ ShapeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Shape") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ ShapeOp)
+TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Shape")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ShapeOp);
+
+class RankOp : public OpKernel {
+ public:
+ explicit RankOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int rank = inp.dims();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ out->scalar<int32>()() = rank;
+ }
+
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
+ RankOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Rank") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("output"), \
+ RankOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Rank")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("output"),
+ RankOp);
+
+class SizeOp : public OpKernel {
+ public:
+ explicit SizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int64 size = inp.NumElements();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ // TODO(josh11b): switch output to int64?
+ out->scalar<int32>()() = size;
+ }
+
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Size").Device(DEVICE_CPU).HostMemory("output"),
+ SizeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Size") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("output"), \
+ SizeOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Size")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("output"),
+ SizeOp);
+
+class ExpandDimsOp : public OpKernel {
+ public:
+ explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ int dim = ctx->input(1).flat<int>()(0);
+ OP_REQUIRES(
+ ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
+ errors::InvalidArgument("Tried to expand dim index ", dim,
+ " for tensor with ", ctx->input(0).dims(),
+ " dimensions."));
+
+ auto existing_dims = ctx->input(0).shape().dim_sizes();
+ std::vector<int64> new_shape(existing_dims.size());
+ for (size_t i = 0; i < new_shape.size(); ++i) {
+ new_shape[i] = existing_dims[i];
+ }
+
+    // We emulate numpy's interpretation of the dim axis when
+    // -1 - input.dims() <= dim <= input.dims().
+ if (dim < 0) {
+ dim += existing_dims.size() + 1;
+ }
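+    // For example, a rank-3 input with dim == -1 maps to dim == 3, i.e. the
+    // new axis is inserted at the end.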
+
+ // Clamp to the end if needed.
+ dim = std::min<int32>(dim, existing_dims.size());
+ new_shape.emplace(new_shape.begin() + dim, 1);
+ const TensorShape output_shape(new_shape);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output));
+ if (!output->CopyFrom(ctx->input(0), output_shape)) {
+ // This should never happen, since the sizes of the input and output
+ // should always be the same (we only expand the dimension with 1).
+ ctx->SetStatus(
+ errors::Internal("Could not expand dimension with input shape ",
+ ctx->input(0).shape().DebugString(),
+ " and output shape ", output_shape.DebugString()));
+ }
+ }
+};
+REGISTER_KERNEL_BUILDER(Name("ExpandDims").Device(DEVICE_CPU).HostMemory("dim"),
+ ExpandDimsOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dim"), \
+ ExpandDimsOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("dim")
+ .HostMemory("output"),
+ ExpandDimsOp);
+
+class SqueezeOp : public OpKernel {
+ public:
+ explicit SqueezeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ std::vector<int32> squeeze_dims;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("squeeze_dims", &squeeze_dims));
+ squeeze_dims_.insert(squeeze_dims.begin(), squeeze_dims.end());
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ auto existing_dims = ctx->input(0).shape().dim_sizes();
+ std::vector<int64> new_shape;
+
+ std::unordered_set<int32> wrapped_squeeze_dims;
+ wrapped_squeeze_dims.reserve(squeeze_dims_.size());
+ // Validate squeeze dims against the input.
+ for (int32 dim : squeeze_dims_) {
+ OP_REQUIRES(
+ ctx, (dim >= -ctx->input(0).dims() && dim < ctx->input(0).dims()),
+ errors::InvalidArgument("Tried to squeeze dim index ", dim,
+ " for tensor with ", ctx->input(0).dims(),
+ " dimensions."));
+      // If dim is < 0, we wrap around (-1 means the last dimension).
+ if (dim < 0) {
+ dim = existing_dims.size() + dim;
+ }
+
+ wrapped_squeeze_dims.insert(dim);
+ }
+
+ for (size_t i = 0; i < existing_dims.size(); ++i) {
+ auto existing_dim = existing_dims[i];
+
+      // If explicit squeeze dims were given, only squeeze those dimensions.
+ if (!wrapped_squeeze_dims.empty()) {
+ if (wrapped_squeeze_dims.count(i) > 0) {
+ OP_REQUIRES(ctx, existing_dim == 1,
+ errors::InvalidArgument("Tried to explicitly squeeze "
+ "dimension ",
+ i, " but dimension was not 1: ",
+ existing_dim));
+ } else {
+ // This dimension is not being squeezed.
+ new_shape.push_back(existing_dim);
+ }
+ } else {
+ // Copy over all non-1-length dimensions.
+ if (existing_dim != 1) {
+ new_shape.push_back(existing_dim);
+ }
+ }
+ }
+
+ const TensorShape output_shape(new_shape);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output));
+ if (!output->CopyFrom(ctx->input(0), output_shape)) {
+ // This should never happen, since the sizes of the input and
+ // output should always be the same.
+ ctx->SetStatus(errors::Internal("Could not squeeze input with shape ",
+ ctx->input(0).shape().DebugString(),
+ " and output shape ",
+ output_shape.DebugString()));
+ }
+ }
+
+ private:
+ std::unordered_set<int32> squeeze_dims_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SqueezeOp);
+TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
new file mode 100644
index 0000000000..3477266d5d
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -0,0 +1,242 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/slice_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+
+gtl::InlinedVector<int64, 4> IntTensorToInt64Vec(const Tensor& tensor) {
+ gtl::InlinedVector<int64, 4> out;
+ if (tensor.dtype() == DT_INT32) {
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ out.push_back(tensor.flat<int32>()(i));
+ }
+ } else if (tensor.dtype() == DT_INT64) {
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ out.push_back(tensor.flat<int64>()(i));
+ }
+ } else {
+ LOG(FATAL) << "begin must be either int32 or int64";
+ }
+ return out;
+}
+
+} // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Shared code that is not dependent on the type of T. We do this to reduce
+// code size by not duplicating all this for all T (float, double, int32, etc.)
+static void SharedValidation(OpKernelContext* context,
+ TensorShape* output_shape, bool* is_identity,
+ bool* slice_dim0,
+ gtl::InlinedVector<int64, 4>* begin,
+ gtl::InlinedVector<int64, 4>* size) {
+ const Tensor& input = context->input(0);
+ const Tensor& begin_tensor = context->input(1);
+ const Tensor& size_tensor = context->input(2);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(begin_tensor.shape()) &&
+ TensorShapeUtils::IsLegacyVector(size_tensor.shape()) &&
+ begin_tensor.NumElements() == input.dims() &&
+ size_tensor.NumElements() == input.dims(),
+ errors::InvalidArgument(
+ "Expected begin and size arguments to be 1-D tensors of size ",
+ input.dims(), ", but got ", begin_tensor.NumElements(), " and ",
+ size_tensor.NumElements(), " instead."));
+
+ const int input_dims = input.dims();
+ *begin = IntTensorToInt64Vec(begin_tensor);
+ *size = IntTensorToInt64Vec(size_tensor);
+ for (int i = 0; i < input_dims; ++i) {
+ if ((*size)[i] == -1) {
+ // A size[i] of -1 means "all elements from begin[i] to dim_size(i)".
+ (*size)[i] = input.dim_size(i) - (*begin)[i];
+ }
+ }
+
+ *is_identity = true;
+ *slice_dim0 = true;
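+  // is_identity stays true only if every dimension is taken in full;
+  // slice_dim0 stays true if at most dimension 0 is actually sliced, which
+  // enables the cheaper paths in SliceOp::Compute below.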
+ for (int i = 0; i < input_dims; ++i) {
+ int64 b = (*begin)[i];
+ int64 s = (*size)[i];
+ if (input.dim_size(i) == 0) {
+ OP_REQUIRES(
+ context, b == 0 && s == 0,
+ errors::InvalidArgument("Expected begin[", i, "] == 0 (got ", b,
+ ") and size[", i, "] == 0 ", "(got ", s,
+ ") when ", "input.dim_size(", i, ") == 0"));
+ } else {
+ OP_REQUIRES(context, 0 <= b && b <= input.dim_size(i),
+ errors::InvalidArgument("Expected begin[", i, "] in [0, ",
+ input.dim_size(i), "], but got ", b));
+ OP_REQUIRES(
+ context, 0 <= s && b + s <= input.dim_size(i),
+ errors::InvalidArgument("Expected size[", i, "] in [0, ",
+ input.dim_size(i) - b, "], but ", "got ", s));
+ }
+ output_shape->AddDim(s);
+ const bool take_all = (b == 0) && (s == input.dim_size(i));
+ (*is_identity) &= take_all;
+ (*slice_dim0) &= (i == 0) || take_all;
+ }
+}
+
+template <typename Device, typename T>
+class SliceOp : public OpKernel {
+ public:
+ explicit SliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ TensorShape output_shape;
+ bool is_identity = true;
+ bool slice_dim0 = true;
+ gtl::InlinedVector<int64, 4> begin;
+ gtl::InlinedVector<int64, 4> size;
+ SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin,
+ &size);
+ if (!context->status().ok()) return;
+ const Tensor& input = context->input(0);
+ if (is_identity) {
+ VLOG(1) << "Slice identity";
+ context->set_output(0, input);
+ return;
+ }
+
+ if (slice_dim0 && IsInnerDimsSizeAligned<T>(input.shape())) {
+ VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
+ CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true.
+ context->set_output(0, input.Slice(begin[0], begin[0] + size[0]));
+ return;
+ }
+
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+ const int input_dims = input.dims();
+
+ if (output_shape.num_elements() > 0) {
+ if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
+ DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ auto input = context->input(0).tensor<T, 2>();
+ auto output = result->tensor<T, 2>();
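+        // Each output row is a contiguous span of the corresponding input
+        // row, so it can be copied with a single memcpy while prefetching
+        // the next rows.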
+ // TODO(agarwal): Consider multi-threading this loop for cases where
+ // size[0] is very large.
+ for (int i = 0; i < size[0]; ++i) {
+ const int row = begin[0] + i;
+ if (i + 1 < size[0]) {
+ port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
+ }
+ memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
+ }
+ return;
+ }
+#define HANDLE_DIM(NDIM) \
+ if (input_dims == NDIM) { \
+ HandleCase<NDIM>(context, begin, size, result); \
+ return; \
+ }
+
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false, errors::Unimplemented(
+ "SliceOp : Unhandled input dimensions"));
+ }
+ }
+
+ private:
+ template <int NDIM>
+ void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+ const gtl::ArraySlice<int64>& size, Tensor* result) {
+ Eigen::DSizes<ptrdiff_t, NDIM> indices;
+ Eigen::DSizes<ptrdiff_t, NDIM> sizes;
+ for (int i = 0; i < NDIM; ++i) {
+ indices[i] = begin[i];
+ sizes[i] = size[i];
+ }
+
+ functor::Slice<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), indices, sizes);
+ }
+};
+
+#define REGISTER_SLICE(type) \
+ REGISTER_KERNEL_BUILDER(Name("Slice") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("size"), \
+ SliceOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_SLICE);
+REGISTER_SLICE(bfloat16);
+
+#undef REGISTER_SLICE
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, NDIM) \
+ template <> \
+ void Slice<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+ typename TTypes<T, NDIM>::ConstTensor input, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes); \
+ extern template struct Slice<GPUDevice, T, NDIM>;
+
+#define DECLARE_FOR_N(T) \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N);
+DECLARE_FOR_N(int32);
+
+#undef DECLARE_FOR_N
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Slice") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("size") \
+ .TypeConstraint<int32>("Index"), \
+ SliceOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+REGISTER_GPU(int32);
+
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
new file mode 100644
index 0000000000..1b6bd9c112
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_SLICE_OP_H_
+#define TENSORFLOW_KERNELS_SLICE_OP_H_
+
+// Functor definition for SliceOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+struct Slice {
+ void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
+ typename TTypes<T, NDIMS>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_sizes) {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SLICE_OP_H_
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
new file mode 100644
index 0000000000..6e919b244c
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/slice_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Slice<GPUDevice, T, 1>; \
+ template struct functor::Slice<GPUDevice, T, 2>; \
+ template struct functor::Slice<GPUDevice, T, 3>; \
+ template struct functor::Slice<GPUDevice, T, 4>; \
+ template struct functor::Slice<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+DEFINE_GPU_KERNELS(int32);
+
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/slice_op_test.cc b/tensorflow/core/kernels/slice_op_test.cc
new file mode 100644
index 0000000000..27c78c6dc0
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op_test.cc
@@ -0,0 +1,73 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+// For the benchmark, we set up a single 2-dimensional tensor of size
+// (2 * kDim) x kMaxSize and slice a kDim x 'size' block out of it.
+template <typename T>
+static void SliceHelper(int iters, int size) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ DataType dt = DataTypeToEnum<T>::v();
+ int kDim = 100;
+ int kMaxSize = 15000;
+ CHECK_LT(size, kMaxSize);
+
+ Tensor begin(DT_INT32, TensorShape({2}));
+ begin.flat<int32>()(0) = 10;
+ begin.flat<int32>()(1) = 10;
+
+ Tensor sizes(DT_INT32, TensorShape({2}));
+ sizes.flat<int32>()(0) = kDim;
+ sizes.flat<int32>()(1) = size;
+
+ Tensor input(dt, TensorShape({2 * kDim, kMaxSize}));
+ input.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Slice")
+ .Input(test::graph::Constant(g, input))
+ .Input(test::graph::Constant(g, begin))
+ .Input(test::graph::Constant(g, sizes))
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_SliceFloat(int iters, int dim2) {
+ SliceHelper<float>(iters, dim2);
+}
+
+BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
+
+static void BM_SliceBFloat16(int iters, int dim2) {
+ SliceHelper<bfloat16>(iters, dim2);
+}
+
+BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
new file mode 100644
index 0000000000..abe6331a4f
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -0,0 +1,62 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/kernels/softmax_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftmaxOp : public OpKernel {
+ public:
+ explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& logits_in = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
+ errors::InvalidArgument("logits must be 2-dimensional"));
+ Tensor* softmax_out = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(0, logits_in.shape(), &softmax_out));
+ functor::SoftmaxFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+ softmax_out->matrix<T>());
+ }
+};
+
+// Partial specialization for a CPUDevice, that uses the Eigen implementation
+// from SoftmaxEigenImpl.
+namespace functor {
+template <typename T>
+struct SoftmaxFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax);
+ }
+};
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SoftmaxOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.h b/tensorflow/core/kernels/softmax_op.h
new file mode 100644
index 0000000000..69bd531b70
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op.h
@@ -0,0 +1,70 @@
+#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_
+#define TENSORFLOW_KERNELS_SOFTMAX_OP_H_
+// Functor definition for SoftmaxOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by SoftmaxOp to do the computations.
+template <typename Device, typename T>
+struct SoftmaxFunctor {
+ // Computes Softmax activation.
+ //
+ // logits: dim: batch_size, num_classes.
+ // softmax: dims: batch_size, num_classes.
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax);
+};
+
+// Eigen code implementing SoftmaxFunctor::operator().
+// This code works for both CPU and GPU and is used by the functor
+// specializations for both device types.
+template <typename Device, typename T>
+struct SoftmaxEigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ const int kBatchDim = 0;
+ const int kClassDim = 1;
+
+ const int batch_size = logits.dimension(kBatchDim);
+ const int num_classes = logits.dimension(kClassDim);
+
+// These arrays are used to reduce along the class dimension, and broadcast
+// the resulting value to all classes.
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 1> along_class(kClassDim);
+ Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+ Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+#else
+ Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
+ Eigen::IndexList<Eigen::type2index<1> > depth_dim;
+ Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
+ batch_by_one.set(0, batch_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
+ one_by_class.set(1, num_classes);
+#endif
+ // NOTE(mdevin): If you modify this implementation please run
+ // the ImageNetSoftmaxFwd benchmark in core_ops_test.cc.
+ //
+ // softmax = exp(logits - max(logits along classes));
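+    // Subtracting the per-row max leaves the result unchanged but keeps
+    // exp() from overflowing for large logits.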
+ softmax.device(d) = (logits -
+ logits.maximum(along_class)
+ .eval()
+ .reshape(batch_by_one)
+ .broadcast(one_by_class)).exp();
+ // softmax = softmax / sum(softmax along classes);
+ softmax.device(d) = (softmax /
+ softmax.sum(along_class)
+ .eval()
+ .reshape(batch_by_one)
+ .broadcast(one_by_class));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_H_
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
new file mode 100644
index 0000000000..d5aaf9c364
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/softmax_op.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation
+// from SoftmaxEigenImpl.
+namespace functor {
+template <typename T>
+struct SoftmaxFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ SoftmaxEigenImpl<GPUDevice, T>::Compute(d, logits, softmax);
+ }
+};
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::SoftmaxFunctor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
new file mode 100644
index 0000000000..b5fb57d3c5
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -0,0 +1,97 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/softplus_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, SoftplusOp<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Softplus<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class SoftplusGradOp
+ : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to SoftplusOp()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::SoftplusGrad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Softplus").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SoftplusOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SoftplusGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SoftplusGradOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void Softplus<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Softplus<GPUDevice, T>; \
+ \
+ template <> \
+ void SoftplusGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ extern template struct SoftplusGrad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Softplus").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SoftplusOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SoftplusGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SoftplusGradOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softplus_op.h b/tensorflow/core/kernels/softplus_op.h
new file mode 100644
index 0000000000..3545a78246
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op.h
@@ -0,0 +1,46 @@
+#ifndef TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+#define TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+// Functor definition for SoftplusOp and SoftplusGradOp, must be compilable by
+// nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by SoftplusOp to do the computations.
+template <typename Device, typename T>
+struct Softplus {
+ // Computes Softplus activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
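+    // softplus(x) = log(1 + exp(x)); for x > 30 this equals x to within
+    // float precision, so the select avoids overflowing exp().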
+ activations.device(d) =
+ (features > features.constant(30.f))
+ .select(features, (features.exp() + features.constant(1.0f)).log());
+ }
+};
+
+// Functor used by SoftplusGradOp to do the computations.
+template <typename Device, typename T>
+struct SoftplusGrad {
+ // Computes SoftplusGrad backprops.
+ //
+ // gradients: gradients backpropagated to the Softplus op.
+  // features: inputs that were passed to the Softplus op.
+ // backprops: gradients to backpropagate to the Softplus inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
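+    // d/dx softplus(x) = sigmoid(x) = 1 / (1 + exp(-x)), so each backprop is
+    // the incoming gradient scaled by the sigmoid of the feature.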
+ backprops.device(d) =
+ gradients / ((-features).exp() + features.constant(1.0f));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
new file mode 100644
index 0000000000..7a974321a7
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
@@ -0,0 +1,25 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/softplus_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in softplus_op.cc.
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Softplus<GPUDevice, T>; \
+ template struct functor::SoftplusGrad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/sparse_concat_op.cc b/tensorflow/core/kernels/sparse_concat_op.cc
new file mode 100644
index 0000000000..72c267a47d
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_concat_op.cc
@@ -0,0 +1,139 @@
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SparseConcatOp : public OpKernel {
+ public:
+ explicit SparseConcatOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("concat_dim", &concat_dim_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ OpInputList inds;
+ OP_REQUIRES_OK(context, context->input_list("indices", &inds));
+ const int N = inds.size();
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(inds[i].shape()),
+ errors::InvalidArgument(
+ "Input indices should be a matrix but received shape ",
+ inds[i].shape().DebugString(), " at position ", i));
+ }
+
+ OpInputList vals;
+ OP_REQUIRES_OK(context, context->input_list("values", &vals));
+ OP_REQUIRES(context, vals.size() == N,
+ errors::InvalidArgument("Expected ", N, " input values, got ",
+ vals.size()));
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(vals[i].shape()),
+ errors::InvalidArgument(
+ "Input values should be a vector but received shape ",
+ vals[i].shape().DebugString(), " at position ", i));
+ }
+
+ OpInputList shapes;
+ OP_REQUIRES_OK(context, context->input_list("shapes", &shapes));
+ OP_REQUIRES(context, shapes.size() == N,
+ errors::InvalidArgument("Expected ", N, " input shapes, got ",
+ shapes.size()));
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(shapes[i].shape()),
+ errors::InvalidArgument(
+ "Input shapes should be a vector but received shape ",
+ shapes[i].shape().DebugString(), " at position ", i));
+ }
+
+ const TensorShape input_shape(shapes[0].vec<int64>());
+ OP_REQUIRES(
+ context, concat_dim_ >= 0 && concat_dim_ < input_shape.dims(),
+ errors::InvalidArgument("Concat dimension must be between 0 and rank (",
+ input_shape.dims(), "), got ", concat_dim_));
+ for (int i = 1; i < N; ++i) {
+ const TensorShape current_shape(shapes[i].vec<int64>());
+ OP_REQUIRES(context, current_shape.dims() == input_shape.dims(),
+ errors::InvalidArgument(
+ "Ranks of all input tensors must match: expected ",
+ input_shape.dims(), " but got ", current_shape.dims(),
+ " at position ", i));
+ for (int j = 0; j < input_shape.dims(); ++j) {
+ if (j != concat_dim_) {
+ OP_REQUIRES(
+ context, input_shape.dim_size(j) == current_shape.dim_size(j),
+ errors::InvalidArgument(
+ "Input shapes must match: expected ", input_shape.dim_size(j),
+ " for dimension ", j, " but got ", current_shape.dim_size(j),
+ " at position ", i));
+ }
+ }
+ }
+
+ // The input and output sparse tensors are assumed to be ordered along
+ // increasing dimension number. But in order for concat to work properly,
+ // order[0] must be concat_dim. So we will reorder the inputs to the
+ // concat ordering, concatenate, then reorder back to the standard order.
+ // We make a deep copy of the input tensors to ensure that the in-place
+ // reorder doesn't create race conditions for other ops that may be
+ // concurrently reading the indices and values tensors.
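+    // For example, with rank-3 inputs and concat_dim_ == 1 the concat
+    // ordering is {1, 0, 2}, and the final Reorder restores {0, 1, 2}.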
+
+ gtl::InlinedVector<int64, 8> std_order(input_shape.dims());
+ std::iota(std_order.begin(), std_order.end(), 0);
+
+ std::vector<int64> concat_order;
+ concat_order.reserve(input_shape.dims());
+ concat_order.push_back(concat_dim_);
+ for (int j = 0; j < input_shape.dims(); ++j) {
+ if (j != concat_dim_) {
+ concat_order.push_back(j);
+ }
+ }
+
+ std::vector<sparse::SparseTensor> sp_inputs;
+ for (int i = 0; i < N; ++i) {
+ const TensorShape current_shape(shapes[i].vec<int64>());
+ sp_inputs.emplace_back(tensor::DeepCopy(inds[i]),
+ tensor::DeepCopy(vals[i]), current_shape,
+ std_order);
+ sp_inputs[i].Reorder<T>(concat_order);
+ }
+
+ sparse::SparseTensor concat = sparse::SparseTensor::Concat<T>(sp_inputs);
+ concat.Reorder<T>(std_order);
+
+ context->set_output(0, concat.indices());
+ context->set_output(1, concat.values());
+
+ Tensor* output_shape_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 2, TensorShape({concat.shape().dims()}),
+ &output_shape_out));
+ auto output_shape = output_shape_out->vec<int64>();
+ for (int j = 0; j < concat.shape().dims(); ++j) {
+ output_shape(j) = concat.shape().dim_size(j);
+ }
+ }
+
+ private:
+ int concat_dim_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseConcat").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseConcatOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
new file mode 100644
index 0000000000..919e129ff8
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -0,0 +1,192 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/port.h"
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename T>
+void PrefetchBlockNTA(const T& tensor, int si, int ei, int sj, int ej) {
+ for (int i = si; i < ei; ++i) {
+ for (int j = sj; j < ej; j = j + 16) {
+ port::prefetch<port::PREFETCH_HINT_NTA>(&tensor(i, j));
+ }
+ }
+}
+
+template <typename T>
+void PrefetchBlockT1(const T& tensor, int si, int ei, int sj, int ej) {
+ for (int i = si; i < ei; ++i) {
+ for (int j = sj; j < ej; j = j + 16) {
+ port::prefetch<port::PREFETCH_HINT_T1>(&tensor(i, j));
+ }
+ }
+}
+
+struct Block {
+ Block(int sm, int em, int sk, int ek, int sn, int en)
+ : startm(sm), endm(em), startk(sk), endk(ek), startn(sn), endn(en) {}
+
+ int startm;
+ int endm;
+ int startk;
+ int endk;
+ int startn;
+ int endn;
+};
+
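+// Advances *next to the block after b, sweeping the k range fastest, then m
+// within [m_start, m), then n. Returns true when there is no further block,
+// i.e. the n range has been exhausted.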
+bool NextBlock(const int Bm, const int Bk, const int Bn, const int m_start,
+ const int m, const int k, const int n, const Block& b,
+ Block* next) {
+ *next = b;
+ if (b.endk < k) {
+ next->startk = b.endk;
+ next->endk = std::min(b.endk + Bk, k);
+ } else {
+ next->startk = 0;
+ next->endk = std::min(Bk, k);
+ if (b.endm < m) {
+ next->startm = b.endm;
+ next->endm = std::min(b.endm + Bm, m);
+ } else {
+ next->startm = m_start;
+ next->endm = std::min(m_start + Bm, m);
+ next->startn = b.endn;
+ next->endn = std::min(b.endn + Bn, n);
+ }
+ }
+ return next->startn == next->endn;
+}
+
+class SparseMatMulOp : public OpKernel {
+ public:
+ explicit SparseMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("a_is_sparse", &a_is_sparse_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("b_is_sparse", &b_is_sparse_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& a = ctx->input(0);
+ const Tensor& b = ctx->input(1);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("a is not a matrix"));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("b is not a matrix"));
+
+ auto left = a.matrix<float>();
+ auto right_mat = b.matrix<float>();
+ const int m = transpose_a_ ? left.dimension(1) : left.dimension(0);
+ const int k = transpose_a_ ? left.dimension(0) : left.dimension(1);
+ const int n =
+ transpose_b_ ? right_mat.dimension(0) : right_mat.dimension(1);
+ const int k2 =
+ transpose_b_ ? right_mat.dimension(1) : right_mat.dimension(0);
+
+ OP_REQUIRES(ctx, k == k2,
+ errors::InvalidArgument("Matrix size incompatible: a: ",
+ a.shape().DebugString(), ", b: ",
+ b.shape().DebugString()));
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
+ auto out = output->matrix<float>();
+
+ if (!a_is_sparse_) {
+      // Fall back to Eigen's tensor contraction.
+      // Note that we currently don't optimize the case where only the right
+      // matrix is sparse. That can generally be handled by transposing the
+      // order of the matmul.
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
+ out.device(ctx->template eigen_device<CPUDevice>()) =
+ left.contract(right_mat, dim_pair);
+ return;
+ }
+ typedef Eigen::Tensor<float, 2, Eigen::RowMajor> Matrix;
+ std::unique_ptr<Matrix> right_tr_mat;
+ std::unique_ptr<TTypes<float>::ConstMatrix> right_tr_map;
+ if (transpose_b_) {
+ right_tr_mat.reset(new Matrix(k, n));
+ Eigen::array<int, 2> perm({1, 0});
+ right_tr_mat->device(ctx->template eigen_device<CPUDevice>()) =
+ right_mat.shuffle(perm);
+ right_tr_map.reset(new TTypes<float>::ConstMatrix(
+ right_tr_mat->data(), right_tr_mat->dimensions()));
+ }
+ TTypes<float>::ConstMatrix& right =
+ transpose_b_ ? *right_tr_map : right_mat;
+
+ const bool transpose_a = transpose_a_;
+
+ typedef Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> TensorMap;
+ typedef Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> ConstTensorMap;
+ typedef Eigen::DSizes<Eigen::DenseIndex, 1> DSizes;
+ const int Bm = 16;
+ const int Bk = 16;
+ const int Bn = 1024;
+
+ auto work_shard = [m, n, k, transpose_a, Bm, Bk, Bn, &left, &right, &out](
+ int64 start64, int64 end64) {
+ const int start = static_cast<int>(start64);
+ const int end = static_cast<int>(end64);
+ Block curr(start, std::min(start + Bm, end), 0, std::min(Bk, k), 0,
+ std::min(Bn, n));
+ Block next(curr);
+ bool done = false;
+ for (int i = start; i < end; ++i) {
+ out.chip<0>(i).setZero();
+ }
+ while (true) {
+ done = NextBlock(Bm, Bk, Bn, start, end, k, n, curr, &next);
+
+ PrefetchBlockT1(right, curr.startk, curr.endk, curr.startn, curr.endn);
+
+ // Process current block
+ for (int i = curr.startm; i < curr.endm; ++i) {
+ PrefetchBlockNTA(left, i, i + 1, curr.startk, curr.endk);
+ PrefetchBlockNTA(out, i, i + 1, curr.startn, curr.endn);
+ DSizes out_slice_shape(curr.endn - curr.startn);
+ TensorMap out_i(&out(i, curr.startn), out_slice_shape);
+ for (int j = curr.startk; j < curr.endk; ++j) {
+ const float l = transpose_a ? left(j, i) : left(i, j);
+ if (l == 0) continue;
+ ConstTensorMap right_j(&right(j, curr.startn), out_slice_shape);
+ out_i += right_j * l;
+ }
+ }
+ if (done) break;
+ curr = next;
+ }
+ };
+ auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, m, 2 * k * n,
+ work_shard);
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+ bool a_is_sparse_;
+ bool b_is_sparse_;
+ TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMulOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("SparseMatMul").Device(DEVICE_CPU),
+ SparseMatMulOp);
+
+} // end namespace tensorflow
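
The Block/NextBlock machinery above is an explicit iterator over a 3-D tiling of the (m, k, n) iteration space: it advances the reduction dimension k first, then the row blocks assigned to the shard, then the output-column blocks. Written as plain loops (the free-function form and names here are illustrative, not part of the kernel), the same traversal order is:

#include <algorithm>

// Illustrative only: the traversal order NextBlock() produces, as plain loops.
// [start, end) are the rows assigned to one shard; Bm/Bk/Bn are the block
// sizes hard-coded in the kernel above.
void WalkBlocks(int start, int end, int k, int n, int Bm, int Bk, int Bn,
                void (*ProcessBlock)(int, int, int, int, int, int)) {
  for (int sn = 0; sn < n; sn += Bn)          // output columns, outermost
    for (int sm = start; sm < end; sm += Bm)  // rows of this shard
      for (int sk = 0; sk < k; sk += Bk)      // reduction dim, innermost
        ProcessBlock(sm, std::min(sm + Bm, end), sk, std::min(sk + Bk, k),
                     sn, std::min(sn + Bn, n));
}

Keeping k innermost lets a Bk x Bn tile of `right` stay warm in cache while rows of `left` stream through it, which is what the prefetch calls above are arranged around.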
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
new file mode 100644
index 0000000000..883d0d1224
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -0,0 +1,139 @@
+#include "tensorflow/core/framework/types.pb.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+random::PhiloxRandom philox(1, 1);
+random::SimplePhilox rnd(&philox);
+
+void Sparsify(Tensor* t, float sparsity) {
+ const int64 N = t->NumElements();
+ CHECK_LE(sparsity, 1);
+ if (sparsity <= 0) return;
+ auto flat = t->flat<float>();
+ static const uint32 K = 10000;
+ for (int64 i = 0; i < N; ++i) {
+ if (rnd.Uniform(K) < sparsity * K) {
+ flat(i) = 0;
+ }
+ }
+}
+
+Node* SparseMatMulNode(Graph* g, Node* in0, Node* in1, bool transpose_a,
+ bool transpose_b, bool a_sparse, bool b_sparse) {
+ Node* ret;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseMatMul")
+ .Input(in0)
+ .Input(in1)
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Attr("a_is_sparse", a_sparse)
+ .Attr("b_is_sparse", b_sparse)
+ .Finalize(g, &ret));
+ return ret;
+}
+
+static Graph* SparseMatMulHelper(Graph* g, int m, int n, int d, float sparsity,
+ bool transpose_a, bool transpose_b,
+ bool a_sparse, bool b_sparse) {
+ a_sparse = a_sparse && (sparsity > 0);
+ b_sparse = b_sparse && (sparsity > 0);
+
+ auto left_shape = transpose_a ? TensorShape({d, m}) : TensorShape({m, d});
+ Tensor left(DataTypeToEnum<float>::value, left_shape);
+ left.flat<float>().setRandom();
+ if (a_sparse) {
+ Sparsify(&left, sparsity);
+ }
+
+ auto right_shape = transpose_b ? TensorShape({n, d}) : TensorShape({d, n});
+ Tensor right(DataTypeToEnum<float>::value, right_shape);
+ right.flat<float>().setRandom();
+ if (b_sparse) {
+ Sparsify(&right, sparsity);
+ }
+
+ SparseMatMulNode(g, test::graph::Constant(g, left),
+ test::graph::Constant(g, right), transpose_a, transpose_b,
+ a_sparse, b_sparse);
+ return g;
+}
+
+static Graph* SparseMatMul(int m, int n, int d, float sparsity,
+ bool transpose_a, bool transpose_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ return SparseMatMulHelper(g, m, n, d, sparsity, transpose_a, transpose_b,
+ true, false);
+}
+
+static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_a,
+ float sparsity_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ if (sparsity_a == 0 && sparsity_b > 0) {
+ SparseMatMulHelper(g, m, n, d, sparsity_a, false, false, false, false);
+ SparseMatMulHelper(g, n, d, m, sparsity_b, true, true, true, false);
+ SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false);
+ } else {
+ SparseMatMulHelper(g, m, n, d, sparsity_a, false, true, true, false);
+ SparseMatMulHelper(g, d, n, m, sparsity_a, true, false, true, true);
+ SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false);
+ }
+ return g;
+}
+
+#define BM_SPARSE(M, K, N, S) \
+ static void BM_Sparse##_##M##_##K##_##N##_##S(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ std::string label = strings::Printf("%d_%d_%d_%0.2f", M, K, N, S / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, false, false)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S);
+
+BM_SPARSE(2048, 2048, 2048, 0);
+BM_SPARSE(2048, 2048, 2048, 1);
+BM_SPARSE(2048, 2048, 2048, 85);
+
+BM_SPARSE(1024, 1024, 1024, 0);
+BM_SPARSE(1024, 1024, 1024, 1);
+BM_SPARSE(1024, 1024, 1024, 85);
+
+BM_SPARSE(256, 256, 256, 1);
+BM_SPARSE(512, 512, 512, 1);
+
+#define BM_SPARSE_MULTI(M, K, N, S1, S2) \
+ static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 3); \
+ std::string label = strings::Printf("%d_%d_%d_%0.2f_%0.2f", M, K, N, \
+ S1 / 100.0, S2 / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2);
+
+BM_SPARSE_MULTI(512, 2140, 4096, 0, 82);
+BM_SPARSE_MULTI(512, 4096, 2048, 83, 83);
+
+#define BM_SPARSE_TR(M, K, N, S, TA, TB) \
+ static void BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ std::string label = \
+ strings::Printf("%d_%d_%d_%d_%d_%0.2f", M, K, N, TA, TB, S / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, TA, TB)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB);
+
+BM_SPARSE_TR(2048, 2048, 2048, 1, true, false);
+BM_SPARSE_TR(2048, 2048, 2048, 1, false, true);
+BM_SPARSE_TR(2048, 2048, 2048, 1, true, true);
+
+} // end namespace tensorflow
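
Sparsify() above zeroes roughly a `sparsity` fraction of entries using TF's SimplePhilox; a self-contained standard-library equivalent (illustrative only, and a different random stream) would be:

#include <cstdint>
#include <random>
#include <vector>

// Zero out roughly a `sparsity` fraction (in [0, 1]) of the entries.
void SparsifyStd(std::vector<float>* data, float sparsity,
                 uint32_t seed = 1) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (float& v : *data) {
    if (uniform(rng) < sparsity) v = 0.0f;
  }
}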
diff --git a/tensorflow/core/kernels/sparse_reorder_op.cc b/tensorflow/core/kernels/sparse_reorder_op.cc
new file mode 100644
index 0000000000..fd6824a4e2
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_reorder_op.cc
@@ -0,0 +1,71 @@
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SparseReorderOp : public OpKernel {
+ public:
+ explicit SparseReorderOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_ind = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_ind.shape()),
+ errors::InvalidArgument(
+ "Input indices should be a matrix but received shape",
+ input_ind.shape().DebugString()));
+
+ const Tensor& input_val = context->input(1);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_val.shape()),
+ errors::InvalidArgument(
+ "Input values should be a vector but received shape",
+ input_val.shape().DebugString()));
+
+ const Tensor& input_shape_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
+ errors::InvalidArgument(
+ "Input shape should be a vector but received shape",
+ input_shape_in.shape().DebugString()));
+
+ const TensorShape input_shape(input_shape_in.vec<int64>());
+
+ gtl::InlinedVector<int64, 8> std_order(input_shape.dims());
+ std::iota(std_order.begin(), std_order.end(), 0);
+
+ // Check if the sparse tensor is already ordered correctly
+ sparse::SparseTensor input_sp(input_ind, input_val, input_shape, std_order);
+
+ if (input_sp.IndicesValid()) {
+ context->set_output(0, input_sp.indices());
+ context->set_output(1, input_sp.values());
+ } else {
+ // Deep-copy the input Tensors, then reorder in-place
+ sparse::SparseTensor reordered_sp(tensor::DeepCopy(input_ind),
+ tensor::DeepCopy(input_val),
+ input_shape);
+ reordered_sp.Reorder<T>(std_order);
+ context->set_output(0, reordered_sp.indices());
+ context->set_output(1, reordered_sp.values());
+ }
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseReorder").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseReorderOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+} // namespace tensorflow
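
Reorder<T>() puts the (index row, value) pairs into row-major (lexicographic) order. A standalone sketch of that ordering without SparseTensor, using flat vectors and illustrative names:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

// indices: num_elems x num_dims, row-major; values: num_elems.
void LexicographicReorder(std::vector<int64_t>* indices,
                          std::vector<float>* values, int num_dims) {
  const int num_elems = static_cast<int>(values->size());
  std::vector<int> perm(num_elems);
  std::iota(perm.begin(), perm.end(), 0);
  std::sort(perm.begin(), perm.end(), [&](int a, int b) {
    for (int d = 0; d < num_dims; ++d) {
      const int64_t ia = (*indices)[a * num_dims + d];
      const int64_t ib = (*indices)[b * num_dims + d];
      if (ia != ib) return ia < ib;
    }
    return false;  // rows compare equal
  });
  // Apply the permutation to both indices and values.
  std::vector<int64_t> new_indices(indices->size());
  std::vector<float> new_values(num_elems);
  for (int i = 0; i < num_elems; ++i) {
    new_values[i] = (*values)[perm[i]];
    for (int d = 0; d < num_dims; ++d) {
      new_indices[i * num_dims + d] = (*indices)[perm[i] * num_dims + d];
    }
  }
  *indices = std::move(new_indices);
  *values = std::move(new_values);
}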
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
new file mode 100644
index 0000000000..47e91c134d
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -0,0 +1,129 @@
+// See core/ops/sparse_ops.cc for documentation.
+//
+// NOTE: the operations in this file are only suitable for execution
+// on CPUs.
+
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <sstream>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+// Operator to convert sparse representations to dense.
+template <typename T, typename Index>
+class SparseToDense : public OpKernel {
+ public:
+ explicit SparseToDense(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ // sparse_indices
+ const Tensor& indices = c->input(0);
+ OP_REQUIRES(c, indices.dims() <= 2,
+ errors::InvalidArgument(
+ "sparse_indices should be a scalar, vector, or matrix, "
+ "got shape ",
+ indices.shape().ShortDebugString()));
+ const int64 num_elems = indices.dims() > 0 ? indices.dim_size(0) : 1;
+ const int64 num_dims = indices.dims() > 1 ? indices.dim_size(1) : 1;
+
+ // output_shape
+ const Tensor& output_shape = c->input(1);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsLegacyVector(output_shape.shape()),
+ errors::InvalidArgument("output_shape should be a vector, got shape ",
+ output_shape.shape().ShortDebugString()));
+ OP_REQUIRES(c, output_shape.NumElements() == num_dims,
+ errors::InvalidArgument(
+ "output_shape has incorrect number of elements: ",
+ output_shape.NumElements(), " should be: ", num_dims));
+
+ // sparse_values
+ const Tensor& sparse_values = c->input(2);
+ const int64 num_values = sparse_values.NumElements();
+ OP_REQUIRES(
+ c, sparse_values.dims() == 0 ||
+ (sparse_values.dims() == 1 && num_values == num_elems),
+ errors::InvalidArgument("sparse_values has incorrect shape ",
+ sparse_values.shape().ShortDebugString(),
+ ", should be [] or [", num_elems, "]"));
+
+ // default_value
+ const Tensor& default_value = c->input(3);
+ OP_REQUIRES(c, TensorShapeUtils::IsScalar(default_value.shape()),
+ errors::InvalidArgument("default_value should be a scalar."));
+
+ auto output_shape_vec = output_shape.flat<Index>();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShapeUtils::MakeShape(
+ output_shape_vec.data(),
+ output_shape_vec.size()),
+ &output));
+
+ TensorShape ix_shape({num_elems, num_dims});
+ Tensor indices_shaped(DT_INT64, ix_shape);
+ if (indices.dtype() == DT_INT64) {
+ CHECK(indices_shaped.CopyFrom(indices, ix_shape));
+ } else {
+ indices_shaped.matrix<int64>() =
+ indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>();
+ }
+
+ // If we received a scalar, we'll need to create a new
+ // tensor with copies of the values as a vec.
+ // TODO(ebrevdo): find a way to avoid this temp allocation.
+ Tensor sparse_values_b;
+
+ if (TensorShapeUtils::IsScalar(sparse_values.shape())) {
+ OP_REQUIRES_OK(
+ c, c->allocate_temp(DataTypeToEnum<T>::value,
+ TensorShape({num_elems}), &sparse_values_b));
+ sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()());
+ } else {
+ sparse_values_b = sparse_values;
+ }
+
+ gtl::InlinedVector<int64, 8> order(output->shape().dims());
+ std::iota(order.begin(), order.end(), 0); // Assume order is correct
+ sparse::SparseTensor st(indices_shaped, sparse_values_b, output->shape(),
+ order);
+
+ output->flat<T>().setConstant(default_value.scalar<T>()());
+ OP_REQUIRES(c, st.template ToDense<T>(output, false /* initialize */),
+ errors::InvalidArgument(
+ "Indices are not valid (out of bounds). Shape: ",
+ output->shape().DebugString()));
+ }
+};
+
+#define REGISTER_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("SparseToDense") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SparseToDense<type, index_type>);
+
+#define REGISTER_KERNELS_ALL(type) \
+ REGISTER_KERNELS(type, int32); \
+ REGISTER_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS_ALL);
+REGISTER_KERNELS_ALL(bool);
+REGISTER_KERNELS_ALL(string);
+
+#undef REGISTER_KERNELS_ALL
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
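
Conceptually the kernel fills the output with default_value and then scatters each (index, value) pair at its row-major offset. A minimal sketch under the same assumptions (in-bounds indices, values already broadcast from a scalar if needed; names and plain-vector types are illustrative):

#include <cstdint>
#include <vector>

// indices: num_elems x ndims (row-major); shape: ndims entries;
// values: num_elems entries.
std::vector<float> ScatterToDense(const std::vector<int64_t>& indices,
                                  const std::vector<float>& values,
                                  const std::vector<int64_t>& shape,
                                  float default_value) {
  const int ndims = static_cast<int>(shape.size());
  int64_t total = 1;
  for (int64_t d : shape) total *= d;
  std::vector<float> dense(total, default_value);
  for (size_t e = 0; e < values.size(); ++e) {
    int64_t flat = 0;
    for (int d = 0; d < ndims; ++d) {
      flat = flat * shape[d] + indices[e * ndims + d];  // row-major offset
    }
    dense[flat] = values[e];
  }
  return dense;
}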
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
new file mode 100644
index 0000000000..e9800ccd68
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -0,0 +1,283 @@
+#include <functional>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+namespace {
+
+class SparseToDenseTest : public OpsTestBase {
+ protected:
+ void SetUp() override { RequireDefaultOps(); }
+
+ void MakeOp(int dim, DataType index_type, DataType value_type) {
+ ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(value_type))
+ .Input(FakeInput(value_type))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SparseToDenseTest, OneD_OneValue) {
+ MakeOp(1, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3}), {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({1}), {5});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {5});
+ test::FillValues<float>(&expected, {-2, 2, -2, 2, 2});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, OneD_OneValue_int64_double) {
+ MakeOp(1, DT_INT64, DT_DOUBLE);
+
+ // sparse_indices
+ AddInputFromArray<int64>(TensorShape({3}), {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int64>(TensorShape({1}), {5});
+ // sparse_values
+ AddInputFromArray<double>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<double>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_DOUBLE, {5});
+ test::FillValues<double>(&expected, {-2, 2, -2, 2, 2});
+ test::ExpectTensorEqual<double>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, OneD_MultValues) {
+ MakeOp(1, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>({3}, {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int32>({1}, {5});
+ // sparse_values
+ AddInputFromArray<float>({3}, {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>({}, {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {5});
+ test::FillValues<float>(&expected, {-2, 3, -2, 4, 5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, TwoD_OneValue) {
+ MakeOp(2, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({2}), {3, 4});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 2>()(0, 1) = 2;
+ expected.tensor<float, 2>()(0, 2) = 2;
+ expected.tensor<float, 2>()(2, 3) = 2;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, TwoD_MultValues) {
+ MakeOp(2, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({2}), {3, 4});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({3}), {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 2>()(0, 1) = 3;
+ expected.tensor<float, 2>()(0, 2) = 4;
+ expected.tensor<float, 2>()(2, 3) = 5;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, ThreeD_OneValue) {
+ MakeOp(3, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4, 2});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 3>()(0, 1, 1) = 2;
+ expected.tensor<float, 3>()(0, 2, 0) = 2;
+ expected.tensor<float, 3>()(2, 3, 1) = 2;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, ThreeD_MultValues) {
+ MakeOp(3, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({3}), {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4, 2});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 3>()(0, 1, 1) = 3;
+ expected.tensor<float, 3>()(0, 2, 0) = 4;
+ expected.tensor<float, 3>()(2, 3, 1) = 5;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+} // namespace
+
+static int BM_Arg(int ndim, int n) { return (ndim * 1000000) + n; }
+static int NDIM_from_arg(int bm_arg) { return bm_arg / 1000000; }
+static int N_from_arg(int bm_arg) { return bm_arg % 1000000; }
+
+static void BM_SparseToDense(int iters, const int bm_arg) {
+ const int NDIM = NDIM_from_arg(bm_arg);
+ const int N = N_from_arg(bm_arg);
+ // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
+ tensorflow::testing::StopTiming();
+
+ const int IndexDim = (NDIM == 1) ? 0 : 1;
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+  // Build an output shape with N along the index dimension and 3 elsewhere.
+ Tensor output_shape(DT_INT32, TensorShape({NDIM}));
+ Tensor sparse_indices(DT_INT32, TensorShape({N, NDIM}));
+ Tensor sparse_values(DT_FLOAT, TensorShape({N}));
+ Tensor default_value(DT_FLOAT, TensorShape({}));
+ auto output_shape_t = output_shape.vec<int32>();
+ for (int d = 0; d < NDIM; ++d) {
+ output_shape_t(d) = (d == IndexDim) ? N : 3;
+ }
+
+ auto sparse_indices_t = sparse_indices.matrix<int32>();
+ for (int n = 0; n < N; ++n) {
+ for (int d = 0; d < NDIM; ++d)
+ sparse_indices_t(n, d) = (d == IndexDim) ? n : 0;
+ }
+
+ for (auto* ptr :
+ {&sparse_indices, &output_shape, &sparse_values, &default_value}) {
+ inputs.push_back({nullptr, ptr});
+ }
+
+ NodeDef sparse_node_def;
+ TF_CHECK_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(&sparse_node_def));
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), sparse_node_def, &status));
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(params));
+ op->Compute(sparse_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete sparse_context->release_output(0).tensor;
+ op->Compute(sparse_context.get());
+ ASSERT_OK(sparse_context->status());
+ }
+ tensorflow::testing::StopTiming();
+
+  // Bytes processed per iteration: mainly reading the sparse input.
+ int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
+
+ tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
+}
+
+BENCHMARK(BM_SparseToDense)
+ ->Arg(BM_Arg(1, 10))
+ ->Arg(BM_Arg(1, 100))
+ ->Arg(BM_Arg(1, 1000))
+ ->Arg(BM_Arg(1, 10000))
+ ->Arg(BM_Arg(2, 10))
+ ->Arg(BM_Arg(2, 100))
+ ->Arg(BM_Arg(2, 1000))
+ ->Arg(BM_Arg(2, 10000))
+ ->Arg(BM_Arg(3, 10))
+ ->Arg(BM_Arg(3, 100))
+ ->Arg(BM_Arg(3, 1000))
+ ->Arg(BM_Arg(3, 10000))
+ ->Arg(BM_Arg(5, 10))
+ ->Arg(BM_Arg(5, 100))
+ ->Arg(BM_Arg(5, 1000))
+ ->Arg(BM_Arg(5, 10000));
+
+} // namespace tensorflow
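
The benchmark packs (NDIM, N) into a single Arg as NDIM * 1000000 + N; a tiny round-trip check of that encoding (illustrative):

#include <cassert>

// Mirrors the encoding above: arg = NDIM * 1000000 + N.
int main() {
  const int arg = 3 * 1000000 + 1000;  // BM_Arg(3, 1000)
  assert(arg / 1000000 == 3);          // NDIM_from_arg
  assert(arg % 1000000 == 1000);       // N_from_arg
  return 0;
}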
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
new file mode 100644
index 0000000000..f4f9ada000
--- /dev/null
+++ b/tensorflow/core/kernels/split_op.cc
@@ -0,0 +1,146 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SplitOp : public OpKernel {
+ public:
+ explicit SplitOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* context) override {
+ const int32 split_dim = context->input(0).flat<int32>()(0);
+ const int32 num_split = num_outputs();
+ const Tensor& input = context->input(1);
+ const TensorShape& input_shape = input.shape();
+
+ OP_REQUIRES(
+ context, 0 <= split_dim && split_dim < input_shape.dims(),
+ errors::InvalidArgument("0 <= split_dim < number of input dimensions (",
+ input_shape.dims(), "), but got ", split_dim));
+
+ OP_REQUIRES(
+ context, num_split > 0,
+ errors::InvalidArgument(
+ "Number of ways to split should be > 0, but got ", num_split));
+
+ OP_REQUIRES(context, input_shape.dim_size(split_dim) % num_split == 0,
+ errors::InvalidArgument(
+ "Number of ways to split should evenly divide the split "
+ "dimension, but got split_dim ",
+ split_dim, " (size = ", input_shape.dim_size(split_dim),
+ ") ", "and num_split ", num_split));
+
+ // Special case 1: num_split == 1. Nothing to do.
+ if (num_split == 1) {
+ VLOG(1) << "Split identity";
+ context->set_output(0, context->input(1));
+ return;
+ }
+
+ // Special case 2: split along the 1st dimension. We can share the
+ // underlying buffer.
+ //
+    // Apply this optimization conservatively: if the input is aligned,
+    // the resulting tensors must be aligned. It's conservative
+    // because if the immediate consumers of the resulting tensors are
+    // not using Eigen for computation, it's perfectly fine to avoid
+    // the copying.
+ if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
+ VLOG(1) << "Slice dim 0: " << input_shape.DebugString();
+ const int64 delta = input_shape.dim_size(0) / num_split;
+ for (int i = 0; i < num_split; ++i) {
+ context->set_output(i, input.Slice(i * delta, (i + 1) * delta));
+ }
+ return;
+ }
+
+ int32 prefix_dim_size = 1;
+ for (int i = 0; i < split_dim; ++i) {
+ prefix_dim_size *= input_shape.dim_size(i);
+ }
+
+ int32 split_dim_size = input_shape.dim_size(split_dim);
+
+ int32 suffix_dim_size = 1;
+ for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
+ suffix_dim_size *= input_shape.dim_size(i);
+ }
+
+ auto input_reshaped =
+ input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});
+
+ const int32 split_dim_output_size = split_dim_size / num_split;
+ TensorShape output_shape(input_shape);
+ output_shape.set_dim(split_dim, split_dim_output_size);
+
+ Eigen::DSizes<ptrdiff_t, 3> indices{0, 0, 0};
+ Eigen::DSizes<ptrdiff_t, 3> sizes{prefix_dim_size, split_dim_output_size,
+ suffix_dim_size};
+
+ for (int i = 0; i < num_split; ++i) {
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(i, output_shape, &result));
+ if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
+ Eigen::DSizes<ptrdiff_t, 3> slice_indices;
+ Eigen::DSizes<ptrdiff_t, 3> slice_sizes;
+ for (int j = 0; j < 3; ++j) {
+ slice_indices[j] = indices[j];
+ slice_sizes[j] = sizes[j];
+ }
+
+ auto result_shaped = result->shaped<T, 3>(
+ {prefix_dim_size, split_dim_output_size, suffix_dim_size});
+
+ functor::Split<Device, T>()(context->eigen_device<Device>(),
+ result_shaped, input_reshaped,
+ slice_indices, slice_sizes);
+ }
+ indices[1] += split_dim_output_size;
+ }
+ }
+};
+
+#define REGISTER_SPLIT(type) \
+ REGISTER_KERNEL_BUILDER(Name("Split") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("split_dim"), \
+ SplitOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_SPLIT);
+
+#undef REGISTER_SPLIT
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Split") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("split_dim"), \
+ SplitOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
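
Outside the two special cases, the kernel reshapes the input to 3-D as (prefix, split_dim, suffix) and slices along the middle dimension. A small sketch of how those sizes are derived, with an illustrative helper name and a worked example in the comment:

#include <cstdint>
#include <vector>

// For input shape {2, 3, 8, 5}, split_dim = 2, num_split = 4:
//   prefix = 2*3 = 6, per-output split size = 8/4 = 2, suffix = 5,
// so each output is a {6, 2, 5} slice of the reshaped {6, 8, 5} input.
void SplitSizes(const std::vector<int64_t>& shape, int split_dim,
                int num_split, int64_t* prefix, int64_t* split_size,
                int64_t* suffix) {
  *prefix = 1;
  for (int i = 0; i < split_dim; ++i) *prefix *= shape[i];
  *split_size = shape[split_dim] / num_split;  // op requires even divisibility
  *suffix = 1;
  for (size_t i = static_cast<size_t>(split_dim) + 1; i < shape.size(); ++i) {
    *suffix *= shape[i];
  }
}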
diff --git a/tensorflow/core/kernels/split_op.h b/tensorflow/core/kernels/split_op.h
new file mode 100644
index 0000000000..2572c77285
--- /dev/null
+++ b/tensorflow/core/kernels/split_op.h
@@ -0,0 +1,31 @@
+#ifndef TENSORFLOW_KERNELS_SPLIT_OP_H_
+#define TENSORFLOW_KERNELS_SPLIT_OP_H_
+// Functor definition for SplitOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct Split {
+ void operator()(const Device& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes);
+};
+
+template <typename T>
+struct Split<Eigen::ThreadPoolDevice, T> {
+ void operator()(const Eigen::ThreadPoolDevice& d,
+ typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SPLIT_OP_H_
diff --git a/tensorflow/core/kernels/split_op_cpu.cc b/tensorflow/core/kernels/split_op_cpu.cc
new file mode 100644
index 0000000000..b86deeb8fb
--- /dev/null
+++ b/tensorflow/core/kernels/split_op_cpu.cc
@@ -0,0 +1,30 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T>
+void Split<Eigen::ThreadPoolDevice, T>::operator()(
+ const Eigen::ThreadPoolDevice& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) {
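+  // Small outputs are evaluated inline on the calling thread; larger ones go
+  // through the threadpool device so the copy can be parallelized.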
+ if (output.size() < 131072) {
+ output = input.slice(slice_indices, slice_sizes);
+ } else {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+ }
+}
+
+#define DEFINE_CPU_KERNELS(T) template struct Split<Eigen::ThreadPoolDevice, T>;
+
+TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
+
+} // namespace functor
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op_gpu.cu.cc b/tensorflow/core/kernels/split_op_gpu.cu.cc
new file mode 100644
index 0000000000..f8931d6a89
--- /dev/null
+++ b/tensorflow/core/kernels/split_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+void Split<Device, T>::operator()(
+ const Device& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+}
+
+#define DEFINE_GPU_KERNELS(T) template struct Split<Eigen::GpuDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.cc b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
new file mode 100644
index 0000000000..bd6fa47268
--- /dev/null
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
@@ -0,0 +1,47 @@
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+class StringToHashBucketOp : public OpKernel {
+ public:
+ explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("num_buckets", &num_buckets_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor* input_tensor;
+ OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor));
+ const auto& input_flat = input_tensor->flat<string>();
+
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("output", input_tensor->shape(),
+ &output_tensor));
+ auto output_flat = output_tensor->flat<int64>();
+
+ for (int i = 0; i < input_flat.size(); ++i) {
+ const uint64 input_hash = Hash64(input_flat(i));
+ const uint64 bucket_id = input_hash % num_buckets_;
+      // The number of buckets is always in the positive range of int64, so
+      // the resulting bucket_id is as well. Casting the bucket_id from uint64
+      // to int64 is therefore safe.
+ output_flat(i) = static_cast<int64>(bucket_id);
+ }
+ }
+
+ private:
+ int64 num_buckets_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(StringToHashBucketOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringToHashBucket").Device(DEVICE_CPU),
+ StringToHashBucketOp);
+
+} // namespace tensorflow
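
A minimal standalone sketch of the same modulo-bucketing scheme, using std::hash instead of TF's Hash64 (so the bucket assignments will differ from this kernel's; the function name is illustrative):

#include <cstdint>
#include <functional>
#include <string>

// Map a string to a bucket in [0, num_buckets).
int64_t StringToBucket(const std::string& s, int64_t num_buckets) {
  const uint64_t h = std::hash<std::string>{}(s);
  return static_cast<int64_t>(h % static_cast<uint64_t>(num_buckets));
}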
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
new file mode 100644
index 0000000000..8d23a4fdf8
--- /dev/null
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/parse_ops.cc.
+
+#include <errno.h>
+#include <string>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+static constexpr char kErrorMessage[] =
+ "StringToNumberOp could not correctly convert string: ";
+
+template <typename OutputType>
+class StringToNumberOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+
+ void Compute(OpKernelContext* context) override {
+    // This is not a deep copy of the input tensor; input_tensor shares the
+    // same underlying storage as the op's input.
+ const Tensor* input_tensor;
+ OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor));
+ const auto& input_flat = input_tensor->flat<string>();
+
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("output", input_tensor->shape(),
+ &output_tensor));
+ auto output_flat = output_tensor->flat<OutputType>();
+
+ for (int i = 0; i < input_flat.size(); ++i) {
+ const char* s = input_flat(i).data();
+ Convert(s, &output_flat(i), context);
+ }
+ }
+
+ private:
+ void Convert(const char* s, OutputType* output_data,
+ OpKernelContext* context);
+};
+
+template <>
+void StringToNumberOp<float>::Convert(const char* s, float* output_data,
+ OpKernelContext* context) {
+ OP_REQUIRES(context, strings::safe_strtof(s, output_data),
+ errors::InvalidArgument(kErrorMessage, s));
+}
+
+template <>
+void StringToNumberOp<int32>::Convert(const char* s, int32* output_data,
+ OpKernelContext* context) {
+ OP_REQUIRES(context, strings::safe_strto32(s, output_data),
+ errors::InvalidArgument(kErrorMessage, s));
+}
+
+// Registers the currently supported output types.
+#define REGISTER(type) \
+ REGISTER_KERNEL_BUILDER(Name("StringToNumber") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("out_type"), \
+ StringToNumberOp<type>)
+REGISTER(float);
+REGISTER(int32);
+#undef REGISTER
+
+} // namespace tensorflow
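
The per-element conversion above leans on strings::safe_strtof / safe_strto32. A standard-library sketch with comparable whole-string validation (illustrative only; not a drop-in for the TF helpers):

#include <cerrno>
#include <cstdlib>
#include <optional>
#include <string>

// Returns std::nullopt unless the whole string parses as a float.
std::optional<float> ParseFloat(const std::string& s) {
  errno = 0;
  char* end = nullptr;
  const float v = std::strtof(s.c_str(), &end);
  if (errno != 0 || end == s.c_str() || *end != '\0') return std::nullopt;
  return v;
}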
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
new file mode 100644
index 0000000000..ba765f2e84
--- /dev/null
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -0,0 +1,169 @@
+// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as
+// inputs or outputs in various ways.
+
+// See docs in ../ops/summary_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+class SummaryImageOp : public OpKernel {
+ public:
+ explicit SummaryImageOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("max_images", &max_images_));
+ const TensorProto* proto;
+ OP_REQUIRES_OK(context, context->GetAttr("bad_color", &proto));
+ OP_REQUIRES_OK(context, context->device()->MakeTensorFromProto(
+ *proto, AllocatorAttributes(), &bad_color_));
+ OP_REQUIRES(context, bad_color_.dtype() == DT_UINT8,
+ errors::InvalidArgument("bad_color must be uint8, got ",
+ DataTypeString(bad_color_.dtype())));
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(bad_color_.shape()),
+ errors::InvalidArgument("bad_color must be a vector, got shape ",
+ bad_color_.shape().ShortDebugString()));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& tensor = c->input(1);
+ OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()),
+ errors::InvalidArgument("Tags must have be a scalar"));
+ OP_REQUIRES(c, tensor.dims() == 4 &&
+ (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+ tensor.dim_size(3) == 4),
+ errors::InvalidArgument(
+ "Tensor must be 4-D with last dim 1, 3, or 4, not ",
+ tensor.shape().DebugString()));
+ const string& base_tag = tags.scalar<string>()();
+
+ const int batch_size = tensor.dim_size(0);
+ const int h = tensor.dim_size(1);
+ const int w = tensor.dim_size(2);
+ const int hw = h * w; // Compact these two dims for simplicity
+ const int depth = tensor.dim_size(3);
+ auto tensor_eigen = tensor.shaped<float, 3>({batch_size, hw, depth});
+
+ OP_REQUIRES(c, bad_color_.dim_size(0) >= depth,
+ errors::InvalidArgument(
+ "expected depth <= bad_color.size, got depth = ", depth,
+ ", bad_color.size = ", bad_color_.dim_size(0)));
+ auto bad_color_full = bad_color_.vec<uint8>();
+ typename TTypes<uint8>::Vec bad_color(bad_color_full.data(), depth);
+
+ // RGB (or gray or RGBA) is last dimension
+ Eigen::Tensor<uint8, 2, Eigen::RowMajor> image(hw, depth);
+
+ Summary s;
+ const int N = std::min<int>(max_images_, batch_size);
+ for (int i = 0; i < N; ++i) {
+ Summary::Value* v = s.add_value();
+      // The tag depends on the number of requested images (not the number
+      // produced).
+ //
+ // Note that later on avisu uses "/" to figure out a consistent naming
+ // convention for display, so we append "/image" to guarantee that the
+ // image(s) won't be displayed in the global scope with no name.
+ if (max_images_ > 1) {
+ v->set_tag(strings::StrCat(base_tag, "/image/", i));
+ } else {
+ v->set_tag(strings::StrCat(base_tag, "/image"));
+ }
+
+ if (image.size()) {
+ typename TTypes<float>::ConstMatrix values(
+ &tensor_eigen(i, 0, 0),
+ Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
+
+ // Rescale the image to uint8 range.
+ //
+        // We are trying to generate an RGB image from a float tensor. We do
+ // not have any info about the expected range of values in the tensor
+ // but the generated image needs to have all RGB values within [0, 255].
+ //
+ // We use two different algorithms to generate these values. If the
+ // tensor has only positive values we scale them all by 255/max(values).
+ // If the tensor has both negative and positive values we scale them by
+ // the max of their absolute values and center them around 127.
+ //
+        // This works for most cases, but has the inconvenience of not respecting
+ // the relative dynamic range across different instances of the tensor.
+
+ // Compute min and max ignoring nonfinite pixels
+ float image_min = std::numeric_limits<float>::infinity();
+ float image_max = -image_min;
+ for (int i = 0; i < hw; i++) {
+ bool finite = true;
+ for (int j = 0; j < depth; j++) {
+ if (!std::isfinite(values(i, j))) {
+ finite = false;
+ break;
+ }
+ }
+ if (finite) {
+ for (int j = 0; j < depth; j++) {
+ float value = values(i, j);
+ image_min = std::min(image_min, value);
+ image_max = std::max(image_max, value);
+ }
+ }
+ }
+
+ // Pick an affine transform into uint8
+ const float kZeroThreshold = 1e-6;
+ float scale, offset;
+ if (image_min < 0) {
+ float max_val = std::max(std::abs(image_min), std::abs(image_max));
+ scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
+ offset = 128.0f;
+ } else {
+ scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
+ offset = 0.0f;
+ }
+
+ // Transform image, turning nonfinite values to bad_color
+ for (int i = 0; i < hw; i++) {
+ bool finite = true;
+ for (int j = 0; j < depth; j++) {
+ if (!std::isfinite(values(i, j))) {
+ finite = false;
+ break;
+ }
+ }
+ if (finite) {
+ image.chip<0>(i) =
+ (values.chip<0>(i) * scale + offset).cast<uint8>();
+ } else {
+ image.chip<0>(i) = bad_color;
+ }
+ }
+ }
+
+ Summary::Image* si = v->mutable_image();
+ si->set_height(h);
+ si->set_width(w);
+ si->set_colorspace(depth);
+ OP_REQUIRES(c, png::WriteImageToBuffer(
+ image.data(), w, h, w * depth, depth, 8, -1,
+ si->mutable_encoded_image_string(), nullptr),
+ errors::Internal("PNG encoding failed"));
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+
+ private:
+ int64 max_images_;
+ Tensor bad_color_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ImageSummary").Device(DEVICE_CPU),
+ SummaryImageOp);
+
+} // namespace tensorflow
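
The rescaling logic above boils down to choosing one affine map value * scale + offset into [0, 255]. Pulled out on its own (illustrative helper name), with a worked example in the comments:

#include <algorithm>
#include <cmath>

// Choose scale/offset so that pixel = value * scale + offset lands in
// [0, 255], following the two cases described in the comment above.
void PickScaleOffset(float image_min, float image_max, float* scale,
                     float* offset) {
  const float kZeroThreshold = 1e-6f;
  if (image_min < 0) {
    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
    *scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
    *offset = 128.0f;
  } else {
    *scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
    *offset = 0.0f;
  }
}
// Example: values in [-2, 1] give scale = 63.5, offset = 128, so
// -2 -> 1, 0 -> 128, and 1 -> 191 after the truncating uint8 cast.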
diff --git a/tensorflow/core/kernels/summary_image_op_test.cc b/tensorflow/core/kernels/summary_image_op_test.cc
new file mode 100644
index 0000000000..ddfeeffc0b
--- /dev/null
+++ b/tensorflow/core/kernels/summary_image_op_test.cc
@@ -0,0 +1,141 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+static void EXPECT_SummaryMatches(const Summary& actual,
+ const string& expected_str) {
+ Summary expected;
+ CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected));
+ EXPECT_EQ(expected.DebugString(), actual.DebugString());
+}
+
+// --------------------------------------------------------------------------
+// SummaryImageOp
+// --------------------------------------------------------------------------
+class SummaryImageOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int max_images) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ImageSummary")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("max_images", max_images)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+
+ void CheckAndRemoveEncodedImages(Summary* summary) {
+ for (int i = 0; i < summary->value_size(); ++i) {
+ Summary::Value* value = summary->mutable_value(i);
+ ASSERT_TRUE(value->has_image()) << "No image for value: " << value->tag();
+ ASSERT_FALSE(value->image().encoded_image_string().empty())
+ << "No encoded_image_string for value: " << value->tag();
+ if (VLOG_IS_ON(2)) {
+ // When LOGGING, output the images to disk for manual inspection.
+ TF_CHECK_OK(WriteStringToFile(
+ Env::Default(), strings::StrCat("/tmp/", value->tag(), ".png"),
+ value->image().encoded_image_string()));
+ }
+ value->mutable_image()->clear_encoded_image_string();
+ }
+ }
+};
+
+TEST_F(SummaryImageOpTest, ThreeGrayImagesOutOfFive4dInput) {
+ MakeOp(3 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(TensorShape({5, 2, 1, 1}),
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image/0' image { width: 1 height: 2 colorspace: 1} }
+ value { tag: 'tag/image/1' image { width: 1 height: 2 colorspace: 1} }
+ value { tag: 'tag/image/2' image { width: 1 height: 2 colorspace: 1} }
+ )");
+}
+
+TEST_F(SummaryImageOpTest, OneGrayImage4dInput) {
+ MakeOp(1 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(TensorShape({5 /*batch*/, 2, 1, 1 /*depth*/}),
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image' image { width: 1 height: 2 colorspace: 1} })");
+}
+
+TEST_F(SummaryImageOpTest, OneColorImage4dInput) {
+ MakeOp(1 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(
+ TensorShape({1 /*batch*/, 5 /*rows*/, 2 /*columns*/, 3 /*depth*/}),
+ {
+ /* r0, c0, RGB */ 1.0, 0.1, 0.2,
+ /* r0, c1, RGB */ 1.0, 0.3, 0.4,
+ /* r1, c0, RGB */ 0.0, 1.0, 0.0,
+ /* r1, c1, RGB */ 0.0, 1.0, 0.0,
+ /* r2, c0, RGB */ 0.0, 0.0, 1.0,
+ /* r2, c1, RGB */ 0.0, 0.0, 1.0,
+ /* r3, c0, RGB */ 1.0, 1.0, 0.0,
+ /* r3, c1, RGB */ 1.0, 0.0, 1.0,
+ /* r4, c0, RGB */ 1.0, 1.0, 0.0,
+ /* r4, c1, RGB */ 1.0, 0.0, 1.0,
+ });
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image' image { width: 2 height: 5 colorspace: 3} })");
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
new file mode 100644
index 0000000000..1c4be64b8b
--- /dev/null
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -0,0 +1,141 @@
+// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as
+// inputs or outputs in various ways.
+
+// See docs in ../ops/summary_ops.cc.
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SummaryScalarOp : public OpKernel {
+ public:
+ explicit SummaryScalarOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& values = c->input(1);
+
+ OP_REQUIRES(c, tags.IsSameSize(values) ||
+ (TensorShapeUtils::IsLegacyScalar(tags.shape()) &&
+ TensorShapeUtils::IsLegacyScalar(values.shape())),
+ errors::InvalidArgument("tags and values not the same shape: ",
+ tags.shape().ShortDebugString(), " != ",
+ values.shape().ShortDebugString()));
+ auto Ttags = tags.flat<string>();
+ auto Tvalues = values.flat<T>();
+ Summary s;
+ for (int i = 0; i < Ttags.size(); i++) {
+ Summary::Value* v = s.add_value();
+ v->set_tag(Ttags(i));
+ v->set_simple_value(Tvalues(i));
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SummaryScalarOp<float>);
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SummaryScalarOp<double>);
+
+class SummaryHistoOp : public OpKernel {
+ public:
+ // SummaryHistoOp could be extended to take a list of custom bucket
+ // boundaries as an option.
+ explicit SummaryHistoOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& values = c->input(1);
+ const auto flat = values.flat<float>();
+ OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()),
+ errors::InvalidArgument("tags must be scalar"));
+ // Build histogram of values in "values" tensor
+ histogram::Histogram histo;
+ for (int64 i = 0; i < flat.size(); i++) {
+ float v = flat(i);
+ if (!std::isfinite(v)) {
+ c->SetStatus(
+ errors::OutOfRange("Nan in summary histogram for: ", name()));
+ break;
+ }
+ histo.Add(v);
+ }
+
+ Summary s;
+ Summary::Value* v = s.add_value();
+ v->set_tag(tags.scalar<string>()());
+ histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */);
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("HistogramSummary").Device(DEVICE_CPU),
+ SummaryHistoOp);
+
+struct HistogramResource : public ResourceBase {
+ histogram::ThreadSafeHistogram histogram;
+
+  string DebugString() override { return "A histogram summary. Stats ..."; }
+};
+
+class SummaryMergeOp : public OpKernel {
+ public:
+ explicit SummaryMergeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ Summary s;
+ std::unordered_set<string> tags;
+ for (int input_num = 0; input_num < c->num_inputs(); input_num++) {
+ const Tensor& in = c->input(input_num);
+ auto in_vec = in.flat<string>();
+ for (int i = 0; i < in_vec.dimension(0); i++) {
+ const string& s_in = in_vec(i);
+ Summary summary_in;
+ if (!ParseProtoUnlimited(&summary_in, s_in)) {
+ c->SetStatus(errors::InvalidArgument(
+ "Could not parse one of the summary inputs"));
+ return;
+ }
+
+ for (int v = 0; v < summary_in.value_size(); v++) {
+ if (!tags.insert(summary_in.value(v).tag()).second) {
+ c->SetStatus(errors::InvalidArgument(
+ strings::StrCat("Duplicate tag ", summary_in.value(v).tag(),
+ " found in summary inputs")));
+ return;
+ }
+ *s.add_value() = summary_in.value(v);
+ }
+ }
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MergeSummary").Device(DEVICE_CPU),
+ SummaryMergeOp);
+
+} // namespace tensorflow
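
The merge op's main invariant is that tags are unique across every input summary. The duplicate check on its own, outside the OpKernel machinery (illustrative names):

#include <string>
#include <unordered_set>
#include <vector>

// Returns false (and reports the offender) if any tag repeats.
bool TagsAreUnique(const std::vector<std::string>& tags,
                   std::string* duplicate) {
  std::unordered_set<std::string> seen;
  for (const std::string& tag : tags) {
    if (!seen.insert(tag).second) {
      *duplicate = tag;
      return false;
    }
  }
  return true;
}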
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
new file mode 100644
index 0000000000..fd271a6862
--- /dev/null
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -0,0 +1,282 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+static void EXPECT_SummaryMatches(const Summary& actual,
+ const string& expected_str) {
+ Summary expected;
+ CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected));
+ EXPECT_EQ(expected.DebugString(), actual.DebugString());
+}
+
+class SummaryScalarOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ScalarSummary")
+ .Input(FakeInput())
+ .Input(FakeInput(dt))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryScalarOpTest, SimpleFloat) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"});
+ AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag1' simple_value: 1.0 }
+ value { tag: 'tag2' simple_value: -0.73 }
+ value { tag: 'tag3' simple_value: 10000.0 }
+ )");
+}
+
+TEST_F(SummaryScalarOpTest, SimpleDouble) {
+ MakeOp(DT_DOUBLE);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"});
+ AddInputFromArray<double>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag1' simple_value: 1.0 }
+ value { tag: 'tag2' simple_value: -0.73 }
+ value { tag: 'tag3' simple_value: 10000.0 }
+ )");
+}
+
+TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+}
+
+TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("tags and values not the same shape"))
+ << s;
+}
+
+TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("tags and values not the same shape"))
+ << s;
+}
+
+// --------------------------------------------------------------------------
+// SummaryHistoOp
+// --------------------------------------------------------------------------
+class SummaryHistoOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ ASSERT_OK(NodeDefBuilder("myop", "HistogramSummary")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryHistoOpTest, Simple) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"taghisto"});
+ AddInputFromArray<float>(TensorShape({3, 2}), {0.1, -0.7, 4.1, 4., 5., 4.});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ ASSERT_EQ(summary.value_size(), 1);
+ EXPECT_EQ(summary.value(0).tag(), "taghisto");
+ histogram::Histogram histo;
+ EXPECT_TRUE(histo.DecodeFromProto(summary.value(0).histo()));
+ EXPECT_EQ(
+ "Count: 6 Average: 2.7500 StdDev: 2.20\n"
+ "Min: -0.7000 Median: 3.9593 Max: 5.0000\n"
+ "------------------------------------------------------\n"
+ "[ -0.76, -0.69 ) 1 16.667% 16.667% ###\n"
+ "[ 0.093, 0.1 ) 1 16.667% 33.333% ###\n"
+ "[ 3.8, 4.2 ) 3 50.000% 83.333% ##########\n"
+ "[ 4.6, 5.1 ) 1 16.667% 100.000% ###\n",
+ histo.ToString());
+}
+
+TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+}
+
+TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+}
+
+// --------------------------------------------------------------------------
+// SummaryMergeOp
+// --------------------------------------------------------------------------
+class SummaryMergeOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int num_inputs) {
+ ASSERT_OK(NodeDefBuilder("myop", "MergeSummary")
+ .Input(FakeInput(num_inputs))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryMergeOpTest, Simple) {
+ MakeOp(1);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag3\" simple_value: 10000.0 }", &s2));
+ Summary s3;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag4\" simple_value: 11.0 }", &s3));
+
+ AddInputFromArray<string>(
+ TensorShape({3}),
+ {s1.SerializeAsString(), s2.SerializeAsString(), s3.SerializeAsString()});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ EXPECT_SummaryMatches(summary,
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } "
+ "value { tag: \"tag3\" simple_value: 10000.0 }"
+ "value { tag: \"tag4\" simple_value: 11.0 }");
+}
+
+TEST_F(SummaryMergeOpTest, Simple_MultipleInputs) {
+ MakeOp(3);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag3\" simple_value: 10000.0 }", &s2));
+ Summary s3;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag4\" simple_value: 11.0 }", &s3));
+
+ AddInputFromArray<string>(TensorShape({}), {s1.SerializeAsString()});
+ AddInputFromArray<string>(TensorShape({}), {s2.SerializeAsString()});
+ AddInputFromArray<string>(TensorShape({}), {s3.SerializeAsString()});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ EXPECT_SummaryMatches(summary,
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } "
+ "value { tag: \"tag3\" simple_value: 10000.0 }"
+ "value { tag: \"tag4\" simple_value: 11.0 }");
+}
+
+TEST_F(SummaryMergeOpTest, Error_DuplicateTag) {
+ MakeOp(1);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tagduplicate\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tagduplicate\" simple_value: 1.0 } ", &s2));
+ AddInputFromArray<string>(TensorShape({2}),
+ {s1.SerializeAsString(), s2.SerializeAsString()});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/text_line_reader_op.cc b/tensorflow/core/kernels/text_line_reader_op.cc
new file mode 100644
index 0000000000..51e4d6a2b8
--- /dev/null
+++ b/tensorflow/core/kernels/text_line_reader_op.cc
@@ -0,0 +1,99 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
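+// Outputs one record per line of a text file. Keys are
+// "<filename>:<line number>", where the line number is 1-based and counts any
+// skipped header lines.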
+class TextLineReader : public ReaderBase {
+ public:
+ TextLineReader(const string& node_name, int skip_header_lines, Env* env)
+ : ReaderBase(strings::StrCat("TextLineReader '", node_name, "'")),
+ skip_header_lines_(skip_header_lines),
+ env_(env),
+ line_number_(0) {}
+
+ Status OnWorkStartedLocked() override {
+ line_number_ = 0;
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ input_buffer_.reset(new io::InputBuffer(file, kBufferSize));
+ for (; line_number_ < skip_header_lines_; ++line_number_) {
+ string line_contents;
+ Status status = input_buffer_->ReadLine(&line_contents);
+ if (errors::IsOutOfRange(status)) {
+ // We ignore an end of file error when skipping header lines.
+ // We will end up skipping this file.
+ return Status::OK();
+ }
+ TF_RETURN_IF_ERROR(status);
+ }
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ input_buffer_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ Status status = input_buffer_->ReadLine(value);
+ ++line_number_;
+ if (status.ok()) {
+ *key = strings::StrCat(current_work(), ":", line_number_);
+ *produced = true;
+ return status;
+ }
+ if (errors::IsOutOfRange(status)) { // End of file, advance to the next.
+ *at_end = true;
+ return Status::OK();
+ } else { // Some other reading error
+ return status;
+ }
+ }
+
+ Status ResetLocked() override {
+ line_number_ = 0;
+ input_buffer_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state. Need
+ // to create TextLineReaderState proto to store ReaderBaseState,
+ // line_number_, and input_buffer_->Tell().
+
+ private:
+ enum { kBufferSize = 256 << 10 /* 256 kB */ };
+ const int skip_header_lines_;
+ Env* const env_;
+ int64 line_number_;
+ std::unique_ptr<io::InputBuffer> input_buffer_;
+};
+
+class TextLineReaderOp : public ReaderOpKernel {
+ public:
+ explicit TextLineReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ int skip_header_lines = -1;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("skip_header_lines", &skip_header_lines));
+ OP_REQUIRES(context, skip_header_lines >= 0,
+ errors::InvalidArgument("skip_header_lines must be >= 0 not ",
+ skip_header_lines));
+ Env* env = context->env();
+ SetReaderFactory([this, skip_header_lines, env]() {
+ return new TextLineReader(name(), skip_header_lines, env);
+ });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
+ TextLineReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tf_record_reader_op.cc b/tensorflow/core/kernels/tf_record_reader_op.cc
new file mode 100644
index 0000000000..551be18d5f
--- /dev/null
+++ b/tensorflow/core/kernels/tf_record_reader_op.cc
@@ -0,0 +1,76 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
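+// Outputs one record per TFRecord in a file. Keys are
+// "<filename>:<byte offset at which the record starts>".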
+class TFRecordReader : public ReaderBase {
+ public:
+ TFRecordReader(const string& node_name, Env* env)
+ : ReaderBase(strings::StrCat("TFRecordReader '", node_name, "'")),
+ env_(env),
+ offset_(0) {}
+
+ Status OnWorkStartedLocked() override {
+ offset_ = 0;
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ file_.reset(file);
+ reader_.reset(new io::RecordReader(file));
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ reader_.reset(nullptr);
+ file_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = strings::StrCat(current_work(), ":", offset_);
+ Status status = reader_->ReadRecord(&offset_, value);
+ if (errors::IsOutOfRange(status)) {
+ *at_end = true;
+ return Status::OK();
+ }
+ if (!status.ok()) return status;
+ *produced = true;
+ return Status::OK();
+ }
+
+ Status ResetLocked() override {
+ offset_ = 0;
+ reader_.reset(nullptr);
+ file_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state.
+
+ private:
+ Env* const env_;
+ uint64 offset_;
+ std::unique_ptr<RandomAccessFile> file_;
+ std::unique_ptr<io::RecordReader> reader_;
+};
+
+class TFRecordReaderOp : public ReaderOpKernel {
+ public:
+ explicit TFRecordReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ Env* env = context->env();
+ SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
+ TFRecordReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
new file mode 100644
index 0000000000..d5e0e89d60
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -0,0 +1,460 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#ifdef GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/tile_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
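+// TileOp replicates `input` along each of its dimensions: output dimension i
+// has size input.dim_size(i) * multiples[i]. For example, tiling a [2, 3]
+// tensor with multiples [2, 2] produces a [4, 6] tensor.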
+template <typename Device>
+class TileOp : public OpKernel {
+ public:
+ explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& multiples = context->input(1);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+ errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+ multiples.shape().ShortDebugString()));
+ OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+ errors::InvalidArgument(
+ "Expected multiples argument to be a vector of length ",
+ input.dims(), " but got length ", multiples.dim_size(0)));
+
+ const int input_dims = input.dims();
+ const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+ input_dims);
+
+ TensorShape output_shape;
+ for (int i = 0; i < input_dims; ++i) {
+ OP_REQUIRES(
+ context, multiples_array[i] > 0,
+ errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+ multiples_array[i]));
+ output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
+ }
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM) \
+ if (context->input(0).dtype() == DT && input_dims == NDIM) { \
+ HandleCase<DT, NDIM>(context, multiples_array, result); \
+ return; \
+ }
+
+#define HANDLE_TYPE(T) \
+ HANDLE_DIM(T, 0) \
+ HANDLE_DIM(T, 1) \
+ HANDLE_DIM(T, 2) \
+ HANDLE_DIM(T, 3) \
+ HANDLE_DIM(T, 4) \
+ HANDLE_DIM(T, 5)
+
+ HANDLE_TYPE(DT_BOOL);
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT64);
+ HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice.
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false,
+ errors::Unimplemented(
+ "TileOp : Unhandled input dimensions, DT : ",
+ context->input(0).dtype(), ", dims : ", input_dims));
+ }
+
+ private:
+ template <DataType DT, int NDIM>
+ void HandleCaseImpl(OpKernelContext* context,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ typedef typename EnumToDataType<DT>::Type T;
+ Eigen::array<int32, NDIM> broadcast_array;
+ for (int i = 0; i < NDIM; ++i) {
+ broadcast_array[i] = multiples_array[i];
+ }
+ functor::Tile<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), broadcast_array);
+ }
+
+ template <DataType DT, int NDIM>
+ void HandleCase(OpKernelContext* context,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileOp<Device>::HandleCase(
+ OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: "
+ << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+ << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim) \
+ template <> \
+ template <> \
+ void TileOp<device>::HandleCase<dtype, ndim>( \
+ OpKernelContext * context, \
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
+ HandleCaseImpl<dtype, ndim>(context, multiples_array, result); \
+ }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+ HANDLE_CASE(device, dtype, 0); \
+ HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+HANDLE_CASE_DIM(CPUDevice, DT_STRING);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+// --------------------------------------------------------------------------
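+// TileGradientOp computes the gradient of Tile by summing the incoming
+// gradient over all tiles, so output dimension i has size
+// input.dim_size(i) / multiples[i].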
+template <typename Device>
+class TileGradientOp : public OpKernel {
+ public:
+ explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& multiples = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+ errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+ multiples.shape().ShortDebugString()));
+ OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+ errors::InvalidArgument(
+ "Expected multiples argument to be a vector of length ",
+ input.dims(), " but got length ", multiples.dim_size(0)));
+
+ const int input_dims = input.dims();
+ const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+ input_dims);
+
+ TensorShape output_shape;
+ std::vector<int32> input_dim_size_vec;
+ for (int i = 0; i < input_dims; ++i) {
+ OP_REQUIRES(
+ context, multiples_array[i] > 0,
+ errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+ multiples_array[i]));
+ OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
+ errors::InvalidArgument("Expected input_dim[", i,
+ "] to be divisible by multiples[", i,
+ "], but ", input.dim_size(i), " % ",
+ multiples_array[i], " != 0"));
+ output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
+ input_dim_size_vec.push_back(input.dim_size(i));
+ }
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM) \
+ if (context->input(0).dtype() == DT && input_dims == NDIM) { \
+ HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
+ result); \
+ return; \
+ }
+
+#define HANDLE_TYPE(T) \
+ HANDLE_DIM(T, 0) \
+ HANDLE_DIM(T, 1) \
+ HANDLE_DIM(T, 2) \
+ HANDLE_DIM(T, 3) \
+ HANDLE_DIM(T, 4) \
+ HANDLE_DIM(T, 5)
+
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT64);
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false,
+ errors::Unimplemented(
+ "TileGradientOp : Unhandled input dimensions, DT : ",
+ context->input(0).dtype(), ", dims : ", input_dims));
+ }
+
+ private:
+ template <DataType DT, int NDIM>
+ void HandleCase(OpKernelContext* context,
+ const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result);
+
+ template <DataType DT, int NDIM>
+ void HandleCaseImpl(OpKernelContext* context,
+ const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ typedef typename EnumToDataType<DT>::Type T;
+
+ bool reduction_only = true;
+ std::vector<int> reduction_dims;
+
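+    // The gradient collapses to a single sum + reshape when every dimension
+    // was either not tiled (multiples[i] == 1) or was tiled from size 1
+    // (multiples[i] == input_dims[i]); such dimensions are collected in
+    // reduction_dims and handled by HandleReduce below. Otherwise we fall
+    // through to the generic slice-accumulation loop.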
+ for (int i = 0; i < NDIM; ++i) {
+ if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
+ reduction_only = false;
+ break;
+ } else {
+ if (multiples_array[i] == input_dims[i]) {
+ reduction_dims.push_back(i);
+ }
+ }
+ }
+
+ if (reduction_only) {
+#define HANDLE_DIM(D) \
+ if (reduction_dims.size() == (D)) { \
+ HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
+ return; \
+ }
+ // NOTE(keveman): Handling the most common case here.
+ // Adding more cases here would require more templating and code
+ // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
+ HANDLE_DIM(NDIM > 0 ? 1 : 0);
+
+// Fall through to the unoptimized version.
+#undef HANDLE_DIM
+ }
+
+ Eigen::DSizes<ptrdiff_t, NDIM> indices;
+ Eigen::DSizes<ptrdiff_t, NDIM> sizes;
+
+ // Accumulate slices along the dimensions into the output. The number of
+ // slices along dimension 'i' is simply the multiple along dimension 'i'
+ // passed to the original Tile op.
+ for (int i = 0; i < NDIM; ++i) {
+ sizes[i] = input_dims[i] / multiples_array[i];
+ indices[i] = 0;
+ }
+
+ bool first = true;
+ while (true) {
+ functor::TileGrad<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), indices, sizes, first);
+ first = false;
+ // Increment the begin indices.
+ int i = 0;
+ while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
+ indices[i] = 0;
+ ++i;
+ }
+ // We are finished if we have iterated to the maximum along all
+ // dimensions.
+ if (i == NDIM) {
+ break;
+ }
+ indices[i] += sizes[i];
+ }
+ }
+
+ template <typename T, int NDIM, int REDUCENDIM>
+ void HandleReduce(OpKernelContext* context,
+ const std::vector<int32>& reduce_dim_in, Tensor* result) {
+ static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
+ Eigen::DSizes<ptrdiff_t, REDUCENDIM> reduce_dim;
+ Eigen::DSizes<ptrdiff_t, NDIM> reshape_dim;
+
+ for (int i = 0; i < REDUCENDIM; ++i) {
+ reduce_dim[i] = reduce_dim_in[i];
+ }
+
+ for (int i = 0; i < NDIM; ++i) {
+ reshape_dim[i] = result->dim_size(i);
+ }
+
+ functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileGradientOp<Device>::HandleCase(
+ OpKernelContext* context, const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) {
+ LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
+ << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+ << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim) \
+ template <> \
+ template <> \
+ void TileGradientOp<device>::HandleCase<dtype, ndim>( \
+ OpKernelContext * context, const std::vector<int32>& input_dims, \
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
+ HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
+ }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+ HANDLE_CASE(device, dtype, 0); \
+ HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+REGISTER_KERNEL_BUILDER(Name("Tile").Device(DEVICE_CPU).HostMemory("multiples"),
+ TileOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_CPU)
+ .HostMemory("multiples"),
+ TileGradientOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+#define DEFINE_GPU_TYPE(T) \
+ DEFINE_GPU_DIM(T, 1) \
+ DEFINE_GPU_DIM(T, 2) \
+ DEFINE_GPU_DIM(T, 3) \
+ DEFINE_GPU_DIM(T, 4) \
+ DEFINE_GPU_DIM(T, 5)
+
+#define DEFINE_GPU_DIM(T, NDIM) \
+ template <> \
+ void Tile<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::array<int32, NDIM>& broadcast_array) const; \
+ extern template struct Tile<GPUDevice, T, NDIM>; \
+ template <> \
+ void TileGrad<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes, bool first) const; \
+ extern template struct TileGrad<GPUDevice, T, NDIM>; \
+ template <> \
+ void ReduceAndReshape<GPUDevice, T, NDIM, 1>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::DSizes<ptrdiff_t, 1>& reduce_dim, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const; \
+ extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>;
+
+namespace functor {
+DEFINE_GPU_TYPE(float);
+DEFINE_GPU_TYPE(double);
+DEFINE_GPU_TYPE(int64);
+DEFINE_GPU_TYPE(int32);
+DEFINE_GPU_TYPE(int16);
+} // end namespace functor
+
+#undef DEFINE_GPU_DIM
+#undef DEFINE_GPU_TYPE
+
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int16>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int16>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+#endif // GOOGLE_CUDA
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops.h b/tensorflow/core/kernels/tile_ops.h
new file mode 100644
index 0000000000..b3cc6165e0
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops.h
@@ -0,0 +1,48 @@
+#ifndef TENSORFLOW_KERNELS_TILE_OPS_H_
+#define TENSORFLOW_KERNELS_TILE_OPS_H_
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIM>
+struct Tile {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::array<int32, NDIM>& broadcast_array) const {
+ out.device(d) = in.broadcast(broadcast_array);
+ }
+};
+
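+// Copies (first == true) or accumulates (first == false) the slice of `in`
+// described by `indices` and `sizes` into `out`.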
+template <typename Device, typename T, int NDIM>
+struct TileGrad {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes,
+ bool first) const {
+ if (first) {
+ out.device(d) = in.slice(indices, sizes);
+ } else {
+ out.device(d) += in.slice(indices, sizes);
+ }
+ }
+};
+
+template <typename Device, typename T, int NDIM, int REDUCEDNDIM>
+struct ReduceAndReshape {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::DSizes<ptrdiff_t, REDUCEDNDIM>& reduce_dim,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const {
+ out.device(d) = in.sum(reduce_dim).reshape(reshape_dim);
+ }
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TILE_OPS_H_
diff --git a/tensorflow/core/kernels/tile_ops_gpu.cu.cc b/tensorflow/core/kernels/tile_ops_gpu.cu.cc
new file mode 100644
index 0000000000..29481e1a54
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops_gpu.cu.cc
@@ -0,0 +1,38 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_ops.h"
+#include <stdio.h>
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_TYPE(T) \
+ DEFINE_DIM(T, 1) \
+ DEFINE_DIM(T, 2) \
+ DEFINE_DIM(T, 3) \
+ DEFINE_DIM(T, 4) \
+ DEFINE_DIM(T, 5)
+
+#define DEFINE_DIM(T, NDIM) \
+ template struct Tile<GPUDevice, T, NDIM>; \
+ template struct TileGrad<GPUDevice, T, NDIM>; \
+ template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>;
+
+DEFINE_TYPE(float)
+DEFINE_TYPE(double)
+DEFINE_TYPE(int64)
+DEFINE_TYPE(int32)
+DEFINE_TYPE(int16)
+// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc.
+
+#undef DEFINE_DIM
+#undef DEFINE_TYPE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
new file mode 100644
index 0000000000..79b5d4d07e
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/gtl/top_n.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+template <typename T>
+class TopK : public OpKernel {
+ public:
+ explicit TopK(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& input_in = context->input(0);
+ OP_REQUIRES(context, input_in.dims() == 2,
+ errors::InvalidArgument("input must be 2-dimensional"));
+ OP_REQUIRES(context, input_in.dim_size(1) >= k_,
+ errors::InvalidArgument("input must have at least k columns"));
+
+ const auto& input = input_in.matrix<T>();
+
+ const auto num_rows = input_in.dim_size(0); // generally batch_size
+ const auto num_cols = input_in.dim_size(1);
+
+ Tensor* values_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({num_rows, k_}), &values_out));
+ Tensor* indices_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 1, TensorShape({num_rows, k_}), &indices_out));
+ auto values = values_out->matrix<T>();
+ auto indices = indices_out->matrix<int32>();
+
+ gtl::TopN<std::pair<T, int32>> filter(k_);
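+    // The filter keeps the k largest (value, -column) pairs of the current
+    // row; it is reset after each row.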
+
+ for (int r = 0; r < num_rows; r++) {
+ for (int32 c = 0; c < num_cols; ++c) {
+ // The second element is the negated index, so that lower-index elements
+ // are considered larger than higher-index elements in case of ties.
+ filter.push(std::make_pair(input(r, c), -c));
+ }
+
+ std::unique_ptr<std::vector<std::pair<T, int32>>> top_k(filter.Extract());
+ for (int32 i = 0; i < k_; ++i) {
+ values(r, i) = (*top_k)[i].first;
+ indices(r, i) = -(*top_k)[i].second;
+ }
+ filter.Reset();
+ }
+ }
+
+ private:
+ int k_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TopK").Device(DEVICE_CPU).TypeConstraint<type>("T"), TopK<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
new file mode 100644
index 0000000000..611fa4ac41
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -0,0 +1,884 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/training_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
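+// Updates on small tensors (<= 256K elements) are evaluated inline on the
+// calling thread; larger ones are expressed through the Eigen device so the
+// thread pool can parallelize them.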
+static inline bool DoInline(int64 size) { return size <= (256ll << 10); }
+
+template <typename T>
+struct ApplyGradientDescent<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ if (DoInline(var.size())) {
+ var -= grad * lr();
+ } else {
+ var.device(d) -= grad * lr();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyAdagrad<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ if (DoInline(var.size())) {
+ accum += grad.square();
+ var -= grad * lr() * accum.rsqrt();
+ } else {
+ accum.device(d) += grad.square();
+ var.device(d) -= grad * lr() * accum.rsqrt();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyMomentum<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum) {
+ if (DoInline(var.size())) {
+ accum = accum * momentum() + grad;
+ var -= accum * lr();
+ } else {
+ accum.device(d) = accum * momentum() + grad;
+ var.device(d) -= accum * lr();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyAdam<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
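+    // Adam update as implemented below:
+    //   alpha = lr * sqrt(1 - beta2_power) / (1 - beta1_power)
+    //   m    <- m + (1 - beta1) * (grad - m)
+    //   v    <- v + (1 - beta2) * (grad^2 - v)
+    //   var  <- var - alpha * m / (sqrt(v) + epsilon)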
+ const T alpha = lr() * std::sqrt(1 - beta2_power()) / (1 - beta1_power());
+ if (DoInline(var.size())) {
+ m += (grad - m) * (1 - beta1());
+ v += (grad.square() - v) * (1 - beta2());
+ var -= (m * alpha) / (v.sqrt() + epsilon());
+ } else {
+ m.device(d) += (grad - m) * (1 - beta1());
+ v.device(d) += (grad.square() - v) * (1 - beta2());
+ var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+ }
+ }
+};
+
+template <typename T>
+struct ApplyRMSProp<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
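+    // RMSProp update as implemented below:
+    //   ms  <- ms + (1 - rho) * (grad^2 - ms)
+    //   mom <- momentum * mom + lr * grad / sqrt(ms + epsilon)
+    //   var <- var - mom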
+ if (DoInline(var.size())) {
+ ms += (grad.square() - ms) * (1 - rho());
+ mom = mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt());
+ var -= mom;
+ } else {
+ ms.device(d) += (grad.square() - ms) * (1 - rho());
+ mom.device(d) =
+ mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt());
+ var.device(d) -= mom;
+ }
+ }
+};
+
+} // namespace functor
+
+template <typename Device, typename T>
+class ApplyGradientDescentOp : public OpKernel {
+ public:
+ explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ const Tensor& alpha = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(alpha.shape()),
+ errors::InvalidArgument("alpha is not a scalar: ",
+ alpha.shape().DebugString()));
+ const Tensor& delta = ctx->input(2);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(delta.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ delta.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ const Tensor& alpha = ctx->input(1);
+ const Tensor& delta = ctx->input(2);
+ functor::ApplyGradientDescent<Device, T>()(
+ device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyGradientDescentOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyGradientDescent<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::ConstScalar alpha, \
+ typename TTypes<T>::ConstFlat delta); \
+ extern template struct ApplyGradientDescent<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyAdagradOp : public OpKernel {
+ public:
+ explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ // Don't try to acquire a lock on the second ref as they share the same
+ // mutex.
+ //
+ // mutex_lock l2(*ctx->input_ref_mutex(1));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(2);
+ const Tensor& grad = ctx->input(3);
+ functor::ApplyAdagrad<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+ lr.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyAdagradOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyAdagrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyAdagrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyAdagradOp : public OpKernel {
+ public:
+ explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+ mutex* mu_var = ctx->input_ref_mutex(0);
+ // mu_accum is actually the same mutex as mu_var since currently we use a
+ // global mutex.
+ //
+ // mutex* mu_accum = ctx->input_ref_mutex(1);
+ if (use_exclusive_lock_) {
+ mu_var->lock();
+ }
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+ errors::InvalidArgument("var must be at least 1 dimensional"));
+
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ const Tensor& indices = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices must be one-dimensional"));
+
+ for (int d = 1; d < var.dims(); d++) {
+ OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+ errors::InvalidArgument(strings::StrCat(
+ "var and grad must match in dimension ", d)));
+ }
+ const Tindex N = indices.dim_size(0);
+ OP_REQUIRES(
+ ctx, grad.dim_size(0) == N,
+ errors::InvalidArgument(
+ "grad must be the same size as indices in the first dimension."));
+
+ if (N > 0) {
+ const Tindex first_dim_size = var.dim_size(0);
+ // Validate all the indices are in range
+ auto indices_vec = indices.vec<Tindex>();
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ OP_REQUIRES(ctx, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+
+ auto var_flat = var.flat_outer_dims<T>();
+ auto accum_flat = accum.flat_outer_dims<T>();
+ auto grad_flat = grad.flat_outer_dims<T>();
+ T lr_scalar = lr.scalar<T>()();
+
+ // Note(yonghui): It might be worth multi-threading square() and rsqrt().
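+      // For each selected row: accum[index] += grad[i]^2 and
+      // var[index] -= lr * grad[i] / sqrt(accum[index]).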
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ auto a = accum_flat.template chip<0>(index);
+ auto g = grad_flat.template chip<0>(i);
+ auto v = var_flat.template chip<0>(index);
+ a += g.square();
+ v -= g.constant(lr_scalar) * g * a.rsqrt();
+ }
+ }
+ if (use_exclusive_lock_) {
+ mu_var->unlock();
+ }
+
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(T, Tindices) \
+ REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<Tindices>("Tindices"), \
+ SparseApplyAdagradOp<T, Tindices>);
+
+REGISTER_KERNELS(float, int32);
+REGISTER_KERNELS(float, int64);
+REGISTER_KERNELS(double, int32);
+REGISTER_KERNELS(double, int64);
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyMomentumOp : public OpKernel {
+ public:
+ explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ // Don't try to acquire a lock on the second ref as they share the same
+ // mutex.
+ //
+ // mutex_lock l2(*ctx->input_ref_mutex(1));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+
+ const Tensor& momentum = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(2);
+ const Tensor& grad = ctx->input(3);
+ const Tensor& momentum = ctx->input(4);
+ functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+ lr.scalar<T>(), grad.flat<T>(),
+ momentum.scalar<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyMomentumOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyMomentum<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstFlat grad, \
+ typename TTypes<T>::ConstScalar momentum); \
+ extern template struct ApplyMomentum<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyMomentumOp : public OpKernel {
+ public:
+ explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+ mutex* mu_var = ctx->input_ref_mutex(0);
+ // mu_accum is actually the same mutex as mu_var since currently we use a
+ // global mutex.
+ //
+ // mutex* mu_accum = ctx->input_ref_mutex(1);
+ if (use_exclusive_lock_) {
+ mu_var->lock();
+ }
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+ errors::InvalidArgument("var must be at least 1 dimensional"));
+
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ const Tensor& indices = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices must be one-dimensional"));
+
+ for (int d = 1; d < var.dims(); d++) {
+ OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+ errors::InvalidArgument(strings::StrCat(
+ "var and grad must match in dimension ", d)));
+ }
+ const Tindex N = indices.dim_size(0);
+ OP_REQUIRES(
+ ctx, grad.dim_size(0) == N,
+ errors::InvalidArgument(
+ "grad must be the same size as indices in the first dimension."));
+
+ const Tensor& momentum = ctx->input(5);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+
+ if (N > 0) {
+ const Tindex first_dim_size = var.dim_size(0);
+ // Validate all the indices are in range
+ auto indices_vec = indices.vec<Tindex>();
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ OP_REQUIRES(ctx, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+
+ auto var_flat = var.flat_outer_dims<T>();
+ auto accum_flat = accum.flat_outer_dims<T>();
+ auto grad_flat = grad.flat_outer_dims<T>();
+ T lr_scalar = lr.scalar<T>()();
+ T momentum_scalar = momentum.scalar<T>()();
+
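+      // For each selected row: accum[index] = momentum * accum[index] + grad[i]
+      // and var[index] -= lr * accum[index].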
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ auto a = accum_flat.template chip<0>(index);
+ auto g = grad_flat.template chip<0>(i);
+ auto v = var_flat.template chip<0>(index);
+ a = a * a.constant(momentum_scalar) + g;
+ v -= a.constant(lr_scalar) * a;
+ }
+ }
+ if (use_exclusive_lock_) {
+ mu_var->unlock();
+ }
+
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(T, Tindices) \
+ REGISTER_KERNEL_BUILDER(Name("SparseApplyMomentum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<Tindices>("Tindices"), \
+ SparseApplyMomentumOp<T, Tindices>);
+
+REGISTER_KERNELS(float, int32);
+REGISTER_KERNELS(float, int64);
+REGISTER_KERNELS(double, int32);
+REGISTER_KERNELS(double, int64);
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyAdamOp : public OpKernel {
+ public:
+ explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ // all input refs share the same mutex
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor m = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor v = ctx->mutable_input(2, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, m.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, v.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(2)));
+
+ const Tensor& beta1_power = ctx->input(3);
+ const Tensor& beta2_power = ctx->input(4);
+ const Tensor& lr = ctx->input(5);
+ const Tensor& beta1 = ctx->input(6);
+ const Tensor& beta2 = ctx->input(7);
+ const Tensor& epsilon = ctx->input(8);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+ errors::InvalidArgument("beta1_power is not a scalar: ",
+ beta1_power.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+ errors::InvalidArgument("beta2_power is not a scalar: ",
+ beta2_power.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+ errors::InvalidArgument("beta1 is not a scalar: ",
+ beta1.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+ errors::InvalidArgument("beta2 is not a scalar: ",
+ beta2.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+ errors::InvalidArgument("epsilon is not a scalar: ",
+ epsilon.shape().DebugString()));
+
+ const Tensor& grad = ctx->input(9);
+ OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+ errors::InvalidArgument("var and m do not have the same shape",
+ var.shape().DebugString(), " ",
+ m.shape().DebugString()));
+ OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+ errors::InvalidArgument("var and v do not have the same shape",
+ var.shape().DebugString(), " ",
+ v.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and grad do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor m = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor v = ctx->mutable_input(2, use_exclusive_lock_);
+ const Tensor& beta1_power = ctx->input(3);
+ const Tensor& beta2_power = ctx->input(4);
+ const Tensor& lr = ctx->input(5);
+ const Tensor& beta1 = ctx->input(6);
+ const Tensor& beta2 = ctx->input(7);
+ const Tensor& epsilon = ctx->input(8);
+ const Tensor& grad = ctx->input(9);
+
+ functor::ApplyAdam<Device, T>()(device, var.flat<T>(), m.flat<T>(),
+ v.flat<T>(), beta1_power.scalar<T>(),
+ beta2_power.scalar<T>(), lr.scalar<T>(),
+ beta1.scalar<T>(), beta2.scalar<T>(),
+ epsilon.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyAdamOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyAdam<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+ typename TTypes<T>::ConstScalar beta1_power, \
+ typename TTypes<T>::ConstScalar beta2_power, \
+ typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstScalar beta1, \
+ typename TTypes<T>::ConstScalar beta2, \
+ typename TTypes<T>::ConstScalar epsilon, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyAdam<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyRMSPropOp : public OpKernel {
+ public:
+ explicit ApplyRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ // all input refs share the same mutex
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor ms = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor mom = ctx->mutable_input(2, use_exclusive_lock_);
+
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, ms.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, mom.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(2)));
+
+ const Tensor& lr = ctx->input(3);
+ const Tensor& rho = ctx->input(4);
+ const Tensor& momentum = ctx->input(5);
+ const Tensor& epsilon = ctx->input(6);
+ const Tensor& grad = ctx->input(7);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()),
+ errors::InvalidArgument("rho is not a scalar: ",
+ rho.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+ errors::InvalidArgument("epsilon is not a scalar: ",
+ epsilon.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(ms.shape()),
+        errors::InvalidArgument("var and ms do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                ms.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(mom.shape()),
+        errors::InvalidArgument("var and mom do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                mom.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor ms = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor mom = ctx->mutable_input(2, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(3);
+ const Tensor& rho = ctx->input(4);
+ const Tensor& momentum = ctx->input(5);
+ const Tensor& epsilon = ctx->input(6);
+ const Tensor& grad = ctx->input(7);
+
+ functor::ApplyRMSProp<Device, T>()(device, var.flat<T>(), ms.flat<T>(),
+ mom.flat<T>(), lr.scalar<T>(),
+ rho.scalar<T>(), momentum.scalar<T>(),
+ epsilon.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyRMSPropOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyRMSProp<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, \
+ typename TTypes<T>::ConstScalar lr, typename TTypes<T>::ConstScalar rho, \
+ typename TTypes<T>::ConstScalar momentum, \
+ typename TTypes<T>::ConstScalar epsilon, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyRMSProp<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
new file mode 100644
index 0000000000..71f6d0253d
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops.h
@@ -0,0 +1,65 @@
+#ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
+#define TENSORFLOW_KERNELS_TRAINING_OPS_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Each training algorithm has an ApplyXYZ functor struct declared in
+// this header file. The functors are specialized for different devices
+// (CPUDevice in training_ops.cc or GPUDevice in training_ops_gpu.cc).
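+//
+// Each functor applies a single optimizer update in place. In terms of the
+// implementations in this change:
+//   ApplyGradientDescent: var -= alpha * delta
+//   ApplyAdagrad:         accum += grad^2;  var -= lr * grad / sqrt(accum)
+//   ApplyMomentum:        accum = accum * momentum + grad;  var -= lr * accum
+//
+// A kernel dispatches to a functor with the op's device and flattened
+// tensors, e.g. (mirroring ApplyAdamOp::DoCompute in training_ops.cc):
+//   functor::ApplyAdagrad<Device, T>()(ctx->template eigen_device<Device>(),
+//                                      var.flat<T>(), accum.flat<T>(),
+//                                      lr.scalar<T>(), grad.flat<T>());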
+
+template <typename Device, typename T>
+struct ApplyGradientDescent {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar alpha,
+ typename TTypes<T>::ConstFlat delta);
+};
+
+template <typename Device, typename T>
+struct ApplyAdagrad {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyMomentum {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum);
+};
+
+template <typename Device, typename T>
+struct ApplyAdam {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyRMSProp {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TRAINING_OPS_H_
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
new file mode 100644
index 0000000000..3106f29648
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -0,0 +1,127 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
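+// In the GPU specializations below, the scalar hyperparameters are reshaped
+// to size-1 tensors and broadcast to the gradient's length so that each
+// update stays a single element-wise Eigen expression evaluated on the
+// device.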
+template <typename T>
+struct ApplyGradientDescent<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar alpha,
+ typename TTypes<T>::ConstFlat delta) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = delta.dimension(0);
+ Eigen::Sizes<1> single;
+ var.device(d) -= alpha.reshape(single).broadcast(bcast) * delta;
+ }
+};
+
+template <typename T>
+struct ApplyAdagrad<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ accum.device(d) += grad.square();
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ var.device(d) -= lr.reshape(single).broadcast(bcast) * grad * accum.rsqrt();
+ }
+};
+
+template <typename T>
+struct ApplyMomentum<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad;
+ var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+ }
+};
+
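+// ApplyAdam evaluates the Adam update implemented below:
+//   m   <- m + (1 - beta1) * (grad - m)
+//   v   <- v + (1 - beta2) * (grad^2 - v)
+//   var <- var - lr * sqrt(1 - beta2_power) / (1 - beta1_power)
+//                   * m / (epsilon + sqrt(v))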
+template <typename T>
+struct ApplyAdam<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ const auto one = static_cast<T>(1.0);
+ m.device(d) =
+ m +
+ (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+ (grad - m);
+ v.device(d) =
+ v +
+ (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+ (grad.square() - v);
+ var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+ (beta1_power.constant(one) - beta1_power))
+ .reshape(single)
+ .broadcast(bcast) *
+ m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+ }
+};
+
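+// ApplyRMSProp evaluates the RMSProp update implemented below:
+//   ms  <- ms + (1 - rho) * (grad^2 - ms)
+//   mom <- mom * momentum + lr * grad / sqrt(epsilon + ms)
+//   var <- var - mom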
+template <typename T>
+struct ApplyRMSProp<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ const auto one = static_cast<T>(1.0);
+ ms.device(d) = ms +
+ (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+ (grad.square() - ms);
+ mom.device(d) =
+ mom * momentum.reshape(single).broadcast(bcast) +
+ lr.reshape(single).broadcast(bcast) * grad /
+ ((epsilon.reshape(single).broadcast(bcast) + ms).sqrt());
+ var.device(d) -= mom;
+ }
+};
+
+} // namespace functor
+
+template struct functor::ApplyGradientDescent<GPUDevice, float>;
+template struct functor::ApplyGradientDescent<GPUDevice, double>;
+
+template struct functor::ApplyAdagrad<GPUDevice, float>;
+template struct functor::ApplyAdagrad<GPUDevice, double>;
+
+template struct functor::ApplyMomentum<GPUDevice, float>;
+template struct functor::ApplyMomentum<GPUDevice, double>;
+
+template struct functor::ApplyAdam<GPUDevice, float>;
+template struct functor::ApplyAdam<GPUDevice, double>;
+
+template struct functor::ApplyRMSProp<GPUDevice, float>;
+template struct functor::ApplyRMSProp<GPUDevice, double>;
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
new file mode 100644
index 0000000000..3c629badb6
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -0,0 +1,226 @@
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// We focus on the single-threaded performance of the training ops.
+static SessionOptions InitSingleThreadedOptions() {
+ SessionOptions opts;
+ opts.config.set_intra_op_parallelism_threads(1);
+ opts.config.set_inter_op_parallelism_threads(1);
+ return opts;
+}
+
+static SessionOptions* GetOptions() {
+ static SessionOptions opts = InitSingleThreadedOptions();
+ return &opts;
+}
+
+static Node* Var(Graph* g, int n) {
+ return test::graph::Var(g, DT_FLOAT, TensorShape({n}));
+}
+
+static Node* Zeros(Graph* g, int n) {
+ Tensor data(DT_FLOAT, TensorShape({n}));
+ data.flat<float>().setZero();
+ return test::graph::Constant(g, data);
+}
+
+static Node* Random(Graph* g, int n) {
+ Tensor data(DT_FLOAT, TensorShape({n}));
+ data.flat<float>().setRandom();
+ return test::graph::Constant(g, data);
+}
+
+static Node* Scalar(Graph* g, float val) {
+ Tensor data(DT_FLOAT, TensorShape({}));
+ data.flat<float>()(0) = val;
+ return test::graph::Constant(g, data);
+}
+
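+// Each optimizer below gets two graphs: an init graph that assigns zeros to
+// the variables, and a train graph that applies one optimizer step with a
+// random gradient. The init graph is passed to test::Benchmark so the
+// variables are initialized before the train graph is timed.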
+static void SGD(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ test::graph::Assign(g, var, Zeros(g, n));
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyGradientDescent", {var, lr, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_SGD(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ SGD(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_SGD)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Adagrad(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, accum, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyAdagrad", {var, accum, lr, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_Adagrad(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Adagrad(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Adagrad)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Momentum(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, accum, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ auto mom = Scalar(g, 0.01);
+ test::graph::Multi(g, "ApplyMomentum", {var, accum, lr, grad, mom});
+ *train_g = g;
+ }
+}
+
+static void BM_Momentum(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Momentum(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Adam(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto m = Var(g, n);
+ auto v = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, m, zero);
+ test::graph::Assign(g, v, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto m = Var(g, n);
+ auto v = Var(g, n);
+ auto beta1_power = Scalar(g, 0.9);
+ auto beta2_power = Scalar(g, 0.99);
+ auto lr = Scalar(g, 0.01);
+ auto beta1 = Scalar(g, 0.9);
+ auto beta2 = Scalar(g, 0.99);
+ auto epsilon = Scalar(g, 1e-8);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
+ beta1, beta2, epsilon, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_Adam(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Adam(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10);
+
+static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto ms = Var(g, n);
+ auto mom = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, ms, zero);
+ test::graph::Assign(g, mom, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto ms = Var(g, n);
+ auto mom = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto rho = Scalar(g, 0.9);
+ auto momentum = Scalar(g, 0.9);
+ auto epsilon = Scalar(g, 1e-8);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyRMSProp",
+ {var, ms, mom, lr, rho, momentum, epsilon, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_RMSProp(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ RMSProp(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
new file mode 100644
index 0000000000..4f11a881f8
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -0,0 +1,190 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/core/kernels/transpose_op_functor.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// inv = InvertPermutationOp(T<int32> p) takes a permutation of
+// integers 0, 1, ..., n - 1 and returns the inverted
+// permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n).
+//
+// REQUIRES: input is a vector of int32.
+// REQUIRES: input is a permutation of 0, 1, ..., n-1.
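+//
+// For example, p = [2, 0, 1] yields inv = [1, 2, 0].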
+
+class InvertPermutationOp : public OpKernel {
+ public:
+ explicit InvertPermutationOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("invert_permutation expects a 1D vector."));
+ auto Tin = input.vec<int32>();
+ const int N = Tin.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ auto Tout = output->vec<int32>();
+ std::fill_n(Tout.data(), N, -1);
+ for (int i = 0; i < N; ++i) {
+ const int32 d = Tin(i);
+ OP_REQUIRES(context, 0 <= d && d < N,
+ errors::InvalidArgument(d, " is not between 0 and ", N));
+ OP_REQUIRES(context, Tout(d) == -1,
+ errors::InvalidArgument(d, " is duplicated in the input."));
+ Tout(d) = i;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation").Device(DEVICE_CPU),
+ InvertPermutationOp);
+
+// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
+// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
+// shuffles the dimensions of the input tensor according to permutation.
+//
+// Specifically, the returned tensor output meets the following condition:
+// 1) output.dims() == input.dims();
+// 2) output.dim_size(i) == input.dim_size(perm[i]);
+// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
+// input.tensor<T, N>(j_0, j_1, ..., j_N-1),
+// where i_s == j_{perm[s]}
+//
+// REQUIRES: perm is a vector of int32.
+// REQUIRES: input.dims() == perm.size().
+// REQUIRES: perm is a permutation.
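+//
+// For example, transposing an input of shape [2, 3, 4] with perm = [2, 0, 1]
+// produces an output of shape [4, 2, 3] where
+//   output(i, j, k) == input(j, k, i).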
+
+template <typename Device, typename T>
+TransposeOp<Device, T>::TransposeOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+template <typename Device, typename T>
+void TransposeOp<Device, T>::Compute(OpKernelContext* context) {
+ const Tensor& input = context->input(0);
+ const Tensor& perm = context->input(1);
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(perm.shape()),
+ errors::InvalidArgument("perm must be a vector, not ",
+ perm.shape().DebugString()));
+ auto Vperm = perm.vec<int32>();
+ const int dims = input.dims();
+ static const int kMinDims = 1;
+ static const int kMaxDims = 8;
+ OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims,
+ errors::Unimplemented("Transposing a tensor of rank ", dims,
+ " is not implemented."));
+ OP_REQUIRES(context, dims == Vperm.size(),
+ errors::InvalidArgument(
+ "transpose expects a vector of size ", input.dims(),
+ ". But input(1) is a vector of size ", Vperm.size()));
+ gtl::ArraySlice<int32> permutation(
+ reinterpret_cast<const int32*>(Vperm.data()), dims);
+ TensorShape shape;
+
+ // Check whether permutation is a permutation of integers of [0 .. dims).
+ gtl::InlinedVector<bool, 8> bits(dims);
+ for (const int32 d : permutation) {
+ OP_REQUIRES(
+ context, 0 <= d && d < dims,
+ errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
+ bits[d] = true;
+ shape.AddDim(input.dim_size(d));
+ }
+ for (int i = 0; i < dims; ++i) {
+ OP_REQUIRES(context, bits[i], errors::InvalidArgument(
+ i, " is missing from {",
+ str_util::Join(permutation, ","), "}."));
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
+ switch (dims) {
+#define EXPAND_DIM(N) \
+ case N: { \
+ functor::TransposeFunctor<Device, T, N> func; \
+ func(context->eigen_device<Device>(), output->tensor<T, N>(), \
+ input.tensor<T, N>(), permutation.data()); \
+ break; \
+ }
+ EXPAND_DIM(1);
+ EXPAND_DIM(2);
+ EXPAND_DIM(3);
+ EXPAND_DIM(4);
+ EXPAND_DIM(5);
+ EXPAND_DIM(6);
+ EXPAND_DIM(7);
+ EXPAND_DIM(8);
+ default:
+ LOG(FATAL) << "Unexpected dims: " << dims;
+ }
+#undef EXPAND_DIM
+}
+
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+void TransposeMaybeInline(const Device& d,
+ typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in,
+ const int* perm) {
+ // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU.
+ Eigen::array<int, NDIMS> p;
+ for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
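+  // For outputs below the 128 KiB threshold the shuffle is evaluated inline
+  // on the calling thread; larger outputs go through the device expression so
+  // the thread pool can parallelize the copy.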
+ if (out.size() * sizeof(T) < 131072) { // Small transpose on a CPU: do inline
+ out = in.shuffle(p);
+ } else {
+ out.device(d) = in.shuffle(p);
+ }
+}
+
+template <typename T, int NDIMS>
+struct TransposeFunctor<CPUDevice, T, NDIMS> {
+ void operator()(const CPUDevice& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ TransposeMaybeInline<CPUDevice, T, NDIMS>(d, out, in, perm);
+ }
+};
+
+} // namespace functor
+
+#define REGISTER(D, T) \
+ template class TransposeOp<D##Device, T>; \
+ REGISTER_KERNEL_BUILDER(Name("Transpose") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("perm"), \
+ TransposeOp<D##Device, T>)
+REGISTER(CPU, float);
+REGISTER(CPU, double);
+REGISTER(CPU, complex64);
+REGISTER(CPU, uint8);
+REGISTER(CPU, int8);
+REGISTER(CPU, int16);
+REGISTER(CPU, int32);
+REGISTER(CPU, int64);
+REGISTER(CPU, string);
+#if GOOGLE_CUDA
+REGISTER(GPU, uint8);
+REGISTER(GPU, int8);
+REGISTER(GPU, int16);
+REGISTER(GPU, int32);
+REGISTER(GPU, int64);
+REGISTER(GPU, float);
+REGISTER(GPU, double);
+#endif
+#undef REGISTER
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
new file mode 100644
index 0000000000..f7a5be5c2b
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -0,0 +1,19 @@
+#ifndef TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
+#define TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+template <typename Device, typename T>
+class TransposeOp : public OpKernel {
+ public:
+ explicit TransposeOp(OpKernelConstruction* context);
+ void Compute(OpKernelContext* context) override;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
diff --git a/tensorflow/core/kernels/transpose_op_functor.h b/tensorflow/core/kernels/transpose_op_functor.h
new file mode 100644
index 0000000000..8cbd1cbb29
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op_functor.h
@@ -0,0 +1,28 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+void Transpose(const Device& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU.
+ Eigen::array<int, NDIMS> p;
+ for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
+ out.device(d) = in.shuffle(p);
+}
+
+template <typename Device, typename T, int NDIMS>
+struct TransposeFunctor {
+ void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/transpose_op_gpu.cu.cc b/tensorflow/core/kernels/transpose_op_gpu.cu.cc
new file mode 100644
index 0000000000..8c04a6544e
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op_gpu.cu.cc
@@ -0,0 +1,43 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/kernels/transpose_op_functor.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T, int NDIMS>
+struct TransposeFunctor<Eigen::GpuDevice, T, NDIMS> {
+ void operator()(const Eigen::GpuDevice& d,
+ typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ Transpose<Eigen::GpuDevice, T, NDIMS>(d, out, in, perm);
+ }
+};
+
+#define DEFINE(T, N) template struct TransposeFunctor<Eigen::GpuDevice, T, N>;
+#define DEFINE_DIM(T) \
+ DEFINE(T, 1); \
+ DEFINE(T, 2); \
+ DEFINE(T, 3); \
+ DEFINE(T, 4); \
+ DEFINE(T, 5); \
+ DEFINE(T, 6); \
+ DEFINE(T, 7); \
+ DEFINE(T, 8);
+DEFINE_DIM(uint8);
+DEFINE_DIM(int8);
+DEFINE_DIM(int16);
+DEFINE_DIM(int32);
+DEFINE_DIM(int64);
+DEFINE_DIM(float);
+DEFINE_DIM(double);
+#undef DEFINE_DIM
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
new file mode 100644
index 0000000000..61f4a54583
--- /dev/null
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -0,0 +1,61 @@
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
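+// Unique returns the distinct elements of a 1-D tensor in order of first
+// occurrence, together with an index vector mapping every input element to
+// its position in the output. For example, input [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// yields output [1, 2, 4, 7, 8] and idx [0, 0, 1, 2, 2, 2, 3, 4, 4].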
+template <typename T>
+class UniqueOp : public OpKernel {
+ public:
+ explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({dt}, {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("unique expects a 1D vector."));
+ auto Tin = input.vec<T>();
+ const int N = Tin.size();
+
+ Tensor* idx = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, input.shape(), &idx));
+ auto idx_vec = idx->template vec<int32>();
+
+ std::unordered_map<T, int32> uniq;
+ uniq.reserve(2 * N);
+ for (int i = 0, j = 0; i < N; ++i) {
+ auto it = uniq.insert(std::make_pair(Tin(i), j));
+ idx_vec(i) = it.first->second;
+ if (it.second) {
+ ++j;
+ }
+ }
+ int32 uniq_size = uniq.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({uniq_size}), &output));
+ auto output_vec = output->template vec<T>();
+
+ for (auto it : uniq) {
+ output_vec(it.second) = it.first;
+ }
+ }
+};
+
+#define REGISTER_UNIQUE(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unique").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ UniqueOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
+#undef REGISTER_UNIQUE
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc
new file mode 100644
index 0000000000..658f2282cf
--- /dev/null
+++ b/tensorflow/core/kernels/unique_op_test.cc
@@ -0,0 +1,51 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+namespace {
+
+static void BM_Unique(int iters, int dim) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ Tensor input(DT_INT32, TensorShape({dim}));
+ input.flat<int32>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Unique")
+ .Input(test::graph::Constant(g, input))
+ .Attr("T", DT_INT32)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32));
+ testing::UseRealTime();
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_Unique)
+ ->Arg(32)
+ ->Arg(256)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(64 * 1024)
+ ->Arg(256 * 1024);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
new file mode 100644
index 0000000000..36cfb2c8e5
--- /dev/null
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -0,0 +1,96 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/split_op.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
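+// Unpack splits a tensor along its first dimension into `num` outputs; a
+// [3, 4, 5] input, for instance, produces three [4, 5] outputs. When the
+// per-output slices are suitably aligned (or empty) the outputs share the
+// input buffer; otherwise the data is copied through the Split functor.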
+template <typename Device, typename T>
+class UnpackOp : public OpKernel {
+ public:
+ explicit UnpackOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* context) override {
+ const int32 num = num_outputs();
+ const Tensor& input = context->input(0);
+ const TensorShape& input_shape = input.shape();
+
+ OP_REQUIRES(
+ context, input_shape.dims() > 0 && input_shape.dim_size(0) == num,
+ errors::InvalidArgument("Input shape must start with ", num, ", got ",
+ input_shape.ShortDebugString()));
+
+ auto output_shape = input_shape;
+ output_shape.RemoveDim(0);
+ const int32 output_size = output_shape.num_elements();
+
+ // Special case: Aligned, so we can share the underlying buffer.
+ //
+    // Apply this optimization conservatively: if the input is aligned,
+    // the resulting tensors must be aligned. It's conservative because
+    // if the immediate consumers of the resulting tensors are not using
+    // Eigen for computation, it's perfectly fine to avoid the copy.
+ if (output_size == 0 || IsInnerDimsSizeAligned<T>(input_shape)) {
+ for (int i = 0; i < num; ++i) {
+ Tensor output;
+ CHECK(output.CopyFrom(input.Slice(i, i + 1), output_shape));
+ context->set_output(i, output);
+ }
+ return;
+ }
+
+ // Except for shape, unpack is a special case of split, so we reuse the
+ // same computational kernels.
+ auto input_reshaped = input.shaped<T, 3>({1, num, output_size});
+
+ for (int i = 0; i < num; ++i) {
+ Tensor* output;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(i, output_shape, &output));
+ auto output_shaped = output->shaped<T, 3>({1, 1, output_size});
+
+ Eigen::DSizes<ptrdiff_t, 3> indices{0, i, 0};
+ Eigen::DSizes<ptrdiff_t, 3> sizes{1, 1, output_size};
+ functor::Split<Device, T>()(context->eigen_device<Device>(),
+ output_shaped, input_reshaped, indices,
+ sizes);
+ }
+ }
+};
+
+#define REGISTER_UNPACK(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unpack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ UnpackOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_UNPACK);
+
+#undef REGISTER_UNPACK
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ UnpackOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
new file mode 100644
index 0000000000..2f1dbc68c0
--- /dev/null
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -0,0 +1,37 @@
+#define EIGEN_USE_THREADS
+#include "tensorflow/core/kernels/variable_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp);
+REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU),
+ TemporaryVariableOp);
+REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable").Device(DEVICE_CPU),
+ DestroyTemporaryVariableOp);
+
+#if GOOGLE_CUDA
+// Only register 'Variable' on GPU for the subset of types also supported by
+// 'Assign' (see dense_update_ops.cc.)
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Variable").Device(DEVICE_GPU).TypeConstraint<type>("dtype"), \
+ VariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("dtype"), \
+ TemporaryVariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T"), \
+ DestroyTemporaryVariableOp);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
new file mode 100644
index 0000000000..77d2da0ad4
--- /dev/null
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -0,0 +1,146 @@
+#ifndef TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+#define TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
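+// VariableOp lazily creates (or looks up) its Var resource in the resource
+// manager on first execution and then outputs a reference to the same
+// underlying tensor on every run, so ops such as Assign can mutate it in
+// place.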
+class VariableOp : public OpKernel {
+ public:
+ explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+ dtype_ = RemoveRefType(context->output_type(0));
+ }
+
+ ~VariableOp() override {
+ if (var_) var_->Unref();
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(init_mu_);
+ if (var_ == nullptr) {
+ OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+ true /* use name() */));
+ auto creator = [this](Var** var) {
+ *var = new Var(dtype_);
+ (*var)->tensor()->set_shape(shape_);
+ return Status::OK();
+ };
+ OP_REQUIRES_OK(ctx,
+ cinfo_.resource_manager()->LookupOrCreate<Var>(
+ cinfo_.container(), cinfo_.name(), &var_, creator));
+ }
+ // Output a reference to our tensor, so it may be updated.
+ //
+ // As long as *this is alive, the ref we return here is valid
+ // because *this owns a ref on var_.
+ ctx->set_output_ref(0, var_->mu(), var_->tensor());
+ }
+
+ private:
+ class Var : public ResourceBase {
+ public:
+ explicit Var(DataType dtype) : tensor_(dtype) {}
+ mutex* mu() { return &mu_; }
+ Tensor* tensor() { return &tensor_; }
+
+ string DebugString() override {
+ return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+ tensor_.shape().ShortDebugString());
+ }
+
+ private:
+ mutex mu_;
+ Tensor tensor_;
+
+ ~Var() override {}
+ TF_DISALLOW_COPY_AND_ASSIGN(Var);
+ };
+
+ DataType dtype_;
+ TensorShape shape_;
+
+ mutex init_mu_;
+ ContainerInfo cinfo_ GUARDED_BY(init_mu_);
+ Var* var_ GUARDED_BY(init_mu_) = nullptr;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
+};
+
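+// TemporaryVariableOp allocates a tensor, registers it in the per-step
+// resource manager under var_name, and outputs a reference to it. The
+// matching DestroyTemporaryVariableOp below forwards the final value as a
+// regular output and deletes the resource.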
+class TemporaryVariableOp : public OpKernel {
+ public:
+ explicit TemporaryVariableOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+ OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+ OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+ // Variable name defaults to op name if not specified explicitly.
+ if (var_name_ == "") var_name_ = name();
+ }
+
+ void Compute(OpKernelContext* context) override {
+ Status s;
+ ResourceMgr* rm = context->step_resource_manager();
+ OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+ auto* tmp_var = new TmpVar;
+ OP_REQUIRES(context, tmp_var,
+ errors::ResourceExhausted("Could not allocate TmpVar."));
+ tmp_var->name = var_name_;
+ s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
+ if (!s.ok()) tmp_var->Unref();
+ OP_REQUIRES_OK(context, s);
+ OP_REQUIRES_OK(context, rm->Create("tmp_var", var_name_, tmp_var));
+ context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+ }
+
+ private:
+ // Refcounted temporary variable resource.
+ friend class DestroyTemporaryVariableOp;
+ struct TmpVar : public ResourceBase {
+ mutex mu;
+ Tensor val;
+ string name;
+ string DebugString() override { return name; }
+ ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
+ };
+
+ TensorShape shape_;
+ DataType dtype_;
+ string var_name_;
+};
+
+class DestroyTemporaryVariableOp : public OpKernel {
+ public:
+ explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+    OP_REQUIRES(context, IsRefType(context->input_type(0)),
+                errors::InvalidArgument("lhs input needs to be a ref type"));
+ OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+ OP_REQUIRES(context, var_name_ != "",
+ errors::InvalidArgument("Missing var_name attribute"));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
+ // their execution before this DestroyTemporaryVariable op executes.
+ // This is typically achieved using control dependencies.
+ CHECK(IsRefType(context->input_dtype(0)));
+ Tensor tmpvar = context->mutable_input(0, false);
+ context->set_output(0, tmpvar);
+ ResourceMgr* rm = context->step_resource_manager();
+ OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+ OP_REQUIRES_OK(
+ context, rm->Delete<TemporaryVariableOp::TmpVar>("tmp_var", var_name_));
+ }
+
+ private:
+ string var_name_;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
new file mode 100644
index 0000000000..9db0943ea7
--- /dev/null
+++ b/tensorflow/core/kernels/where_op.cc
@@ -0,0 +1,74 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/where_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
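+// Where returns the coordinates of the true elements of a bool tensor as an
+// int64 matrix of shape [num_true, input_rank]. The kernel first counts the
+// true elements (functor::NumTrue) to size the output, then writes one row of
+// indices per true element in row-major order (functor::Where). For example,
+// the 2x2 input [[true, false], [false, true]] produces [[0, 0], [1, 1]].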
+template <typename Device>
+class WhereOp : public OpKernel {
+ public:
+ explicit WhereOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+
+ const int input_dims = input.dims();
+ Tensor num_true;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
+ auto num_true_t = num_true.scalar<int64>();
+
+ functor::NumTrue<Device>::Compute(context->eigen_device<Device>(),
+ input.flat<bool>(), num_true_t);
+ TensorShape output_shape({num_true_t(), input_dims});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ functor::Where<Device, NDIM>::Compute(context->eigen_device<Device>(), \
+ input.tensor<bool, NDIM>(), \
+ output->matrix<int64>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "WhereOp : Unhandled input dimensions: ", input_dims));
+ }
+#undef HANDLE_DIM
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(WhereOp);
+};
+
+#define REGISTER_WHERE() \
+ REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_CPU), WhereOp<CPUDevice>);
+
+REGISTER_WHERE();
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
new file mode 100644
index 0000000000..c7b835d02f
--- /dev/null
+++ b/tensorflow/core/kernels/where_op.h
@@ -0,0 +1,65 @@
+#ifndef TENSORFLOW_KERNELS_WHERE_OP_H_
+#define TENSORFLOW_KERNELS_WHERE_OP_H_
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device>
+struct NumTrue {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<bool>::ConstFlat input,
+ TTypes<int64>::Scalar num_true) {
+ num_true.device(d) = input.template cast<int64>().sum();
+ }
+};
+
+template <typename Device, int NDIM>
+struct Where {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<bool, NDIM>::ConstTensor input,
+ typename TTypes<int64>::Matrix output) {
+ Eigen::DenseIndex true_n = 0;
+ Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions();
+ Eigen::DSizes<Eigen::DenseIndex, NDIM> strides;
+
+ // Calculate strides for RowMajor order.
+ EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
+ static_cast<int>(Eigen::RowMajor)),
+ INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
+
+ strides[NDIM - 1] = 1;
+ for (int i = NDIM - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * dims[i + 1];
+ }
+
+ // Note, no bounds checking is done on true_n. It is assumed that
+ // the output was correctly sized via output of NumTrue::Compute.
+ for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
+ if (input.data()[n]) {
+ WriteIndexRowMajor(output, strides, true_n, n);
+ ++true_n;
+ }
+ }
+ }
+
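+  // Decomposes a row-major flat index into NDIM coordinates using the
+  // precomputed strides. For example, with dims [2, 3, 4] the strides are
+  // [12, 4, 1], so flat index 17 maps to the coordinate (1, 1, 1).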
+ EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
+ typename TTypes<int64>::Matrix output,
+ const Eigen::DSizes<Eigen::DenseIndex, NDIM>& strides,
+ Eigen::DenseIndex true_n, Eigen::DenseIndex index) {
+ for (int i = 0; i < NDIM; ++i) {
+ output(true_n, i) = index / strides[i];
+ index %= strides[i];
+ }
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_WHERE_OP_H_
diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc
new file mode 100644
index 0000000000..b940163ec9
--- /dev/null
+++ b/tensorflow/core/kernels/whole_file_read_ops.cc
@@ -0,0 +1,108 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+static Status ReadEntireFile(Env* env, const string& filename,
+ string* contents) {
+ uint64 file_size = 0;
+ TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
+ contents->resize(file_size);
+ RandomAccessFile* file;
+ TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
+ std::unique_ptr<RandomAccessFile> make_sure_file_gets_deleted(file);
+ StringPiece data;
+ TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(*contents)[0]));
+ if (data.size() != file_size) {
+ return errors::DataLoss("Truncated read of '", filename, "' expected ",
+ file_size, " got ", data.size());
+ }
+ if (data.data() != &(*contents)[0]) {
+ memmove(&(*contents)[0], data.data(), data.size());
+ }
+ return Status::OK();
+}
+
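+// WholeFileReader produces exactly one (key, value) record per unit of work:
+// the key is the filename and the value is the entire contents of that file.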
+class WholeFileReader : public ReaderBase {
+ public:
+ WholeFileReader(Env* env, const string& node_name)
+ : ReaderBase(strings::StrCat("WholeFileReader '", node_name, "'")),
+ env_(env) {}
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = current_work();
+ TF_RETURN_IF_ERROR(ReadEntireFile(env_, *key, value));
+ *produced = true;
+ *at_end = true;
+ return Status::OK();
+ }
+
+ // Stores state in a ReaderBaseState proto, since WholeFileReader has
+ // no additional state beyond ReaderBase.
+ Status SerializeStateLocked(string* state) override {
+ ReaderBaseState base_state;
+ SaveBaseState(&base_state);
+ base_state.SerializeToString(state);
+ return Status::OK();
+ }
+
+ Status RestoreStateLocked(const string& state) override {
+ ReaderBaseState base_state;
+ if (!ParseProtoUnlimited(&base_state, state)) {
+ return errors::InvalidArgument("Could not parse state for ", name(), ": ",
+ str_util::CEscape(state));
+ }
+ TF_RETURN_IF_ERROR(RestoreBaseState(base_state));
+ return Status::OK();
+ }
+
+ private:
+ Env* env_;
+};
+
+class WholeFileReaderOp : public ReaderOpKernel {
+ public:
+ explicit WholeFileReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ Env* env = context->env();
+ SetReaderFactory(
+ [this, env]() { return new WholeFileReader(env, name()); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("WholeFileReader").Device(DEVICE_CPU),
+ WholeFileReaderOp);
+
+class ReadFileOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+ void Compute(OpKernelContext* context) override {
+ const Tensor* input;
+ OP_REQUIRES_OK(context, context->input("filename", &input));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(input->shape()),
+ errors::InvalidArgument(
+ "Input filename tensor must be scalar, but had shape: ",
+ input->shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("contents",
+ TensorShape({}), &output));
+ OP_REQUIRES_OK(context,
+ ReadEntireFile(context->env(), input->scalar<string>()(),
+ &output->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReadFile").Device(DEVICE_CPU), ReadFileOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
new file mode 100644
index 0000000000..ff54d157af
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -0,0 +1,90 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/kernels/xent_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftmaxXentWithLogitsOp : public OpKernel {
+ public:
+ explicit SoftmaxXentWithLogitsOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& logits_in = context->input(0);
+ const Tensor& labels_in = context->input(1);
+ OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
+ errors::InvalidArgument(
+ "logits and labels must be same size: logits_size=",
+ logits_in.shape().DebugString(), " labels_size=",
+ labels_in.shape().DebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
+ errors::InvalidArgument("logits must be 2-dimensional"));
+ // As we already tested that both inputs have the same shape no need to
+ // check that "labels" is a matrix too.
+
+ // loss is 1-D (one per example), and size is batch_size.
+
+ Tensor scratch;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DataTypeToEnum<T>::value,
+ TensorShape({logits_in.dim_size(0), 1}),
+ &scratch));
+
+ Tensor* loss_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({logits_in.dim_size(0)}), &loss_out));
+ Tensor* back_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(1, logits_in.shape(), &back_out));
+
+ functor::XentFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+ labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+ back_out->matrix<T>());
+ }
+};
+
+// Partial specialization for a CPUDevice, that uses the Eigen implementation
+// from XentEigenImpl.
+namespace functor {
+template <typename T>
+struct XentFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+ XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+ backprop);
+ }
+};
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxXentWithLogitsOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SoftmaxXentWithLogitsOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxXentWithLogitsOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h
new file mode 100644
index 0000000000..edb7d817c8
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op.h
@@ -0,0 +1,102 @@
+#ifndef TENSORFLOW_KERNELS_XENT_OP_H_
+#define TENSORFLOW_KERNELS_XENT_OP_H_
+// Functor definition for XentOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by XentOp to do the computations.
+template <typename Device, typename T>
+struct XentFunctor {
+ // Computes Cross Entropy loss and backprop.
+ //
+ // logits: batch_size, num_classes.
+ // labels: batch_size, num_classes.
+ // scratch: temporary tensor, dims: batch_size, 1
+ // loss: output tensor for the loss, dims: batch_size.
+ // backprop: output tensor for the backprop, dims: batch_size, num_classes.
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop);
+};
+
+// Eigen code implementing XentFunctor::operator().
+// This code works for both CPU and GPU and is used by the functor
+// specializations for both device types.
+template <typename Device, typename T>
+struct XentEigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+    // NOTE(mdevin): This duplicates some of the computations in softmax_op
+    // because we need the intermediate (logits - max(logits)) values to
+    // avoid a log(exp()) in the computation of the loss.
+
+ const int kBatchDim = 0;
+ const int kClassDim = 1;
+
+ const int batch_size = logits.dimension(kBatchDim);
+ const int num_classes = logits.dimension(kClassDim);
+
+// These arrays are used to reduce along the class dimension, and broadcast
+// the resulting value to all classes.
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::array<int, 1> along_class;
+ along_class[0] = kClassDim;
+ Eigen::array<int, 1> batch_only;
+ batch_only[0] = batch_size;
+ Eigen::array<int, 2> batch_by_one;
+ batch_by_one[0] = batch_size;
+ batch_by_one[1] = 1;
+ Eigen::array<int, 2> one_by_class;
+ one_by_class[0] = 1;
+ one_by_class[1] = num_classes;
+#else
+ Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
+ Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
+ batch_by_one.set(0, batch_size);
+ Eigen::IndexList<int> batch_only;
+ batch_only.set(0, batch_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
+ one_by_class.set(1, num_classes);
+#endif
+
+ // max_logits along classes.
+ scratch.reshape(batch_only).device(d) = logits.maximum(along_class);
+
+ // logits - max_logits.
+ backprop.device(d) = logits - scratch.broadcast(one_by_class);
+
+ // sum(exp(logits - max_logits)) along classes.
+ scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class);
+
+    // NOTE(keveman): Eigen on GPU dispatches to an optimized implementation
+    // for an expression of the form lhs = rhs.sum().
+    // lhs = -rhs.sum() doesn't match that pattern, so we fold the negation in
+    // before calling sum(). The loss computed below is
+    //   sum(-labels *
+    //       ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
+    //   = sum(labels *
+    //         (log(sum(exp(logits - max_logits))) - (logits - max_logits)))
+    // along classes.
+ loss.device(d) =
+ (labels * (scratch.log().eval().broadcast(one_by_class) - backprop))
+ .eval()
+ .sum(along_class);
+
+ // backprop: prob - labels, where
+ // prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
+ backprop.device(d) =
+ (backprop.exp() / scratch.broadcast(one_by_class)) - labels;
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_XENT_OP_H_
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
new file mode 100644
index 0000000000..eec6a84281
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -0,0 +1,35 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/xent_op.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation
+// from XentEigenImpl.
+namespace functor {
+template <typename T>
+struct XentFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+ XentEigenImpl<GPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+ backprop);
+ }
+};
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::XentFunctor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/xent_op_test.cc b/tensorflow/core/kernels/xent_op_test.cc
new file mode 100644
index 0000000000..9aab1b09bf
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op_test.cc
@@ -0,0 +1,46 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/kernels/xent_op.h"
+
+namespace tensorflow {
+
+static Graph* Xent(int batch_size, int num_classes) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor logits(DT_FLOAT, TensorShape({batch_size, num_classes}));
+ logits.flat<float>().setRandom();
+ Tensor labels(DT_FLOAT, TensorShape({batch_size, num_classes}));
+ labels.flat<float>().setRandom();
+ test::graph::Binary(g, "SoftmaxCrossEntropyWithLogits",
+ test::graph::Constant(g, logits),
+ test::graph::Constant(g, labels));
+ return g;
+}
+
+#define BM_XentDev(BATCH, CLASS, DEVICE) \
+ static void BM_Xent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
+ test::Benchmark(#DEVICE, Xent(BATCH, CLASS)).Run(iters); \
+ } \
+ BENCHMARK(BM_Xent##_##BATCH##_##CLASS##_##DEVICE);
+
+/// The representative tests for ptb_word on GPU
+BM_XentDev(16, 10000, gpu);
+BM_XentDev(16, 30000, gpu);
+BM_XentDev(16, 100000, gpu);
+
+BM_XentDev(32, 10000, gpu);
+BM_XentDev(32, 30000, gpu);
+BM_XentDev(32, 100000, gpu);
+
+BM_XentDev(64, 10000, gpu);
+BM_XentDev(64, 30000, gpu);
+BM_XentDev(64, 100000, gpu);
+
+/// Only the smaller tests for CPU. Otherwise, it's too slow
+BM_XentDev(16, 10000, cpu);
+BM_XentDev(32, 10000, cpu);
+BM_XentDev(64, 10000, cpu);
+
+} // end namespace tensorflow