 WORKSPACE | 2
 tensorflow/core/BUILD | 14
 tensorflow/core/kernels/random_op.cc | 151
 tensorflow/core/kernels/random_op_gpu.cu.cc | 19
 tensorflow/core/kernels/tile_ops.cc | 75
 tensorflow/core/lib/random/random_distributions.h | 65
 tensorflow/core/ops/image_ops.cc | 8
 tensorflow/core/ops/ops.pbtxt | 74
 tensorflow/core/ops/random_ops.cc | 32
 tensorflow/g3doc/api_docs/python/array_ops.md | 42
 tensorflow/g3doc/api_docs/python/sparse_ops.md | 36
 tensorflow/g3doc/get_started/os_setup.md | 2
 tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md | 2
 tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py | 69
 tensorflow/g3doc/tutorials/mnist/beginners/index.md | 17
 tensorflow/g3doc/tutorials/mnist/input_data.py | 20
 tensorflow/g3doc/tutorials/mnist/mnist_with_summaries.py | 107
 tensorflow/g3doc/tutorials/word2vec/index.md | 2
 tensorflow/python/BUILD | 2
 tensorflow/python/kernel_tests/random_ops_test.py | 75
 tensorflow/python/kernel_tests/shape_ops_test.py | 46
 tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py | 5
 tensorflow/python/ops/random_ops.py | 47
 tensorflow/python/ops/sparse_ops.py | 55
 tensorflow/python/ops/variable_scope.py | 4
 tensorflow/tensorboard/scripts/tfserve.js | 15
 third_party/eigen3/BUILD | 2
 27 files changed, 715 insertions, 273 deletions
diff --git a/WORKSPACE b/WORKSPACE
index 5789bf20ce..38993d5816 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -32,7 +32,7 @@ bind(
git_repository(
name = "re2",
remote = "https://github.com/google/re2.git",
- tag = "2015-07-01",
+ commit = "791beff",
)
new_http_archive(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 0e51277284..72bd33dab5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -101,7 +101,10 @@ tf_cuda_library(
"**/*main.cc",
],
),
- hdrs = glob(["public/**/*.h"]),
+ hdrs = glob([
+ "public/**/*.h",
+ "util/device_name_utils.h",
+ ]),
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
@@ -345,6 +348,12 @@ cc_library(
alwayslink = 1,
)
+# This is to work around strict header checks
+cc_library(
+ name = "strict_headers",
+ hdrs = glob(["**/*.h"]),
+)
+
# Low level library tests
tf_cc_tests(
tests = glob(
@@ -356,6 +365,7 @@ tf_cc_tests(
),
deps = [
":lib",
+ ":strict_headers",
":test_main",
],
)
@@ -404,6 +414,7 @@ tf_cc_tests(
":direct_session",
":kernels",
":lib",
+ ":strict_headers",
":test_main",
":testlib",
"//tensorflow/cc:cc_ops",
@@ -424,6 +435,7 @@ tf_cc_tests(
deps = [
":direct_session",
":kernels",
+ ":strict_headers",
":test_main",
":testlib",
"//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 9890997bfb..c0f8f77d1d 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -46,7 +46,7 @@ template <typename Device, class Distribution>
struct FillPhiloxRandom {
typedef typename Distribution::ResultElementType T;
void operator()(OpKernelContext*, const Device&, random::PhiloxRandom gen,
- T* data, int64 size) {
+ T* data, int64 size, Distribution dist) {
LOG(FATAL) << "Default FillPhiloxRandom should not be executed.";
}
};
@@ -57,7 +57,8 @@ template <class Distribution>
struct FillPhiloxRandom<GPUDevice, Distribution> {
typedef typename Distribution::ResultElementType T;
void operator()(OpKernelContext* ctx, const GPUDevice&,
- random::PhiloxRandom gen, T* data, int64 size);
+ random::PhiloxRandom gen, T* data, int64 size,
+ Distribution dist);
};
#endif
@@ -72,8 +73,7 @@ template <class Distribution>
struct FillPhiloxRandomTask<Distribution, false> {
typedef typename Distribution::ResultElementType T;
static void Run(random::PhiloxRandom gen, T* data, int64 size,
- int64 start_group, int64 limit_group) {
- Distribution dist;
+ int64 start_group, int64 limit_group, Distribution dist) {
const int kGroupSize = Distribution::kResultElementCount;
gen.Skip(start_group);
@@ -96,7 +96,7 @@ struct FillPhiloxRandomTask<Distribution, false> {
}
};
-// Specialization for distribution that takes a varaiable number of samples for
+// Specialization for distribution that takes a variable number of samples for
// each output. This will be slower due to the generality.
template <class Distribution>
struct FillPhiloxRandomTask<Distribution, true> {
@@ -104,11 +104,10 @@ struct FillPhiloxRandomTask<Distribution, true> {
static const int64 kReservedSamplesPerOutput = 256;
static void Run(random::PhiloxRandom base_gen, T* data, int64 size,
- int64 start_group, int64 limit_group) {
+ int64 start_group, int64 limit_group, Distribution dist) {
using random::PhiloxRandom;
using random::SingleSampleAdapter;
- Distribution dist;
const int kGroupSize = Distribution::kResultElementCount;
static const int kGeneratorSkipPerOutputGroup =
@@ -153,7 +152,8 @@ template <class Distribution>
struct FillPhiloxRandom<CPUDevice, Distribution> {
typedef typename Distribution::ResultElementType T;
void operator()(OpKernelContext* context, const CPUDevice&,
- random::PhiloxRandom gen, T* data, int64 size) {
+ random::PhiloxRandom gen, T* data, int64 size,
+ Distribution dist) {
const int kGroupSize = Distribution::kResultElementCount;
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
@@ -164,17 +164,49 @@ struct FillPhiloxRandom<CPUDevice, Distribution> {
// sub-linear. Too many threads causes a much worse overall performance.
int num_workers = 6;
Shard(num_workers, worker_threads.workers, total_group_count, kGroupSize,
- [&gen, data, size](int64 start_group, int64 limit_group) {
+ [&gen, data, size, dist](int64 start_group, int64 limit_group) {
FillPhiloxRandomTask<
Distribution,
Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
start_group,
- limit_group);
+ limit_group,
+ dist);
});
}
};
} // namespace functor
+namespace {
+
+static Status AllocateOutputWithShape(OpKernelContext* ctx, const Tensor& shape,
+ int index, Tensor** output) {
+ if (!TensorShapeUtils::IsLegacyVector(shape.shape())) {
+ return errors::InvalidArgument(
+ "shape must be a vector of {int32,int64}, got shape ",
+ shape.shape().ShortDebugString());
+ }
+ if (shape.dtype() == DataType::DT_INT32) {
+ auto vec = shape.flat<int32>();
+ TF_RETURN_IF_ERROR(ctx->allocate_output(
+ index, TensorShapeUtils::MakeShape(vec.data(), vec.size()), output));
+ } else if (shape.dtype() == DataType::DT_INT64) {
+ auto vec = shape.flat<int64>();
+ TF_RETURN_IF_ERROR(ctx->allocate_output(
+ index, TensorShapeUtils::MakeShape(vec.data(), vec.size()), output));
+ } else {
+ return errors::InvalidArgument("shape must be a vector of {int32,int64}.");
+ }
+ return Status::OK();
+}
+
+// Reserve enough random samples in the generator for the given output count.
+// Note that the 256 multiplier is repeated above; do not change it just here.
+static random::PhiloxRandom ReserveRandomOutputs(GuardedPhiloxRandom& generator,
+ int64 output_count) {
+ int64 conservative_sample_count = output_count << 8;
+ return generator.ReserveSamples128(conservative_sample_count);
+}
+
// For now, use the same interface as RandomOp, so we can choose either one
// at the run-time.
template <typename Device, class Distribution>
@@ -186,41 +218,65 @@ class PhiloxRandomOp : public OpKernel {
}
void Compute(OpKernelContext* ctx) override {
- const Tensor& input = ctx->input(0);
- OP_REQUIRES(
- ctx, TensorShapeUtils::IsLegacyVector(input.shape()),
- errors::InvalidArgument("shape must be a vector of {int32,int64}."));
- Tensor* output = nullptr;
- if (input.dtype() == DataType::DT_INT32) {
- auto vec = input.flat<int32>();
- OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
- vec.data(), vec.size()),
- &output));
- } else if (input.dtype() == DataType::DT_INT64) {
- auto vec = input.flat<int64>();
- OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
- vec.data(), vec.size()),
- &output));
- } else {
- OP_REQUIRES(ctx, false, errors::InvalidArgument(
- "shape must be a vector of {int32,int64}."));
- }
+ const Tensor& shape = ctx->input(0);
+ Tensor* output;
+ OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output));
+ auto output_flat = output->flat<T>();
functor::FillPhiloxRandom<Device, Distribution>()(
ctx, ctx->eigen_device<Device>(),
- ReserveRandomOutputs(output->flat<T>().size()),
- output->flat<T>().data(), output->flat<T>().size());
+ ReserveRandomOutputs(generator_, output_flat.size()),
+ output_flat.data(), output_flat.size(), Distribution());
}
private:
GuardedPhiloxRandom generator_;
+};
+
+template <typename Device, class IntType>
+class RandomUniformIntOp : public OpKernel {
+ public:
+ explicit RandomUniformIntOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+ }
- // Reserve enough random samples in the generator for the given output count.
- random::PhiloxRandom ReserveRandomOutputs(int64 output_count) {
- int64 conservative_sample_count = output_count << 8;
- return generator_.ReserveSamples128(conservative_sample_count);
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& shape = ctx->input(0);
+ const Tensor& minval = ctx->input(1);
+ const Tensor& maxval = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval.shape()),
+ errors::InvalidArgument("minval must be 0-D, got shape ",
+ minval.shape().ShortDebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(maxval.shape()),
+ errors::InvalidArgument("maxval must be 0-D, got shape ",
+ maxval.shape().ShortDebugString()));
+
+ // Verify that minval < maxval
+ IntType lo = minval.scalar<IntType>()();
+ IntType hi = maxval.scalar<IntType>()();
+ OP_REQUIRES(
+ ctx, lo < hi,
+ errors::InvalidArgument("Need minval < maxval, got ", lo, " >= ", hi));
+
+ // Build distribution
+ typedef random::UniformDistribution<random::PhiloxRandom, IntType>
+ Distribution;
+ Distribution dist(lo, hi);
+
+ Tensor* output;
+ OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output));
+ auto output_flat = output->flat<IntType>();
+ functor::FillPhiloxRandom<Device, Distribution>()(
+ ctx, ctx->eigen_device<Device>(),
+ ReserveRandomOutputs(generator_, output_flat.size()),
+ output_flat.data(), output_flat.size(), dist);
}
+
+ private:
+ GuardedPhiloxRandom generator_;
};
+} // namespace
+
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("RandomUniform") \
@@ -246,10 +302,22 @@ class PhiloxRandomOp : public OpKernel {
random::TruncatedNormalDistribution< \
random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+#define REGISTER_INT(IntType) \
+ REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .HostMemory("minval") \
+ .HostMemory("maxval") \
+ .TypeConstraint<IntType>("Tout"), \
+ RandomUniformIntOp<CPUDevice, IntType>);
+
REGISTER(float);
REGISTER(double);
+REGISTER_INT(int32);
+REGISTER_INT(int64);
#undef REGISTER
+#undef REGISTER_INT
#if GOOGLE_CUDA
@@ -281,10 +349,23 @@ REGISTER(double);
random::TruncatedNormalDistribution< \
random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+#define REGISTER_INT(IntType) \
+ REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .HostMemory("minval") \
+ .HostMemory("maxval") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<IntType>("Tout"), \
+ RandomUniformIntOp<GPUDevice, IntType>);
+
REGISTER(float);
REGISTER(double);
+REGISTER_INT(int32);
+REGISTER_INT(int64);
#undef REGISTER
+#undef REGISTER_INT
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 432ae79545..9c51708c4c 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -42,8 +42,8 @@ struct FillPhiloxRandomKernel;
template <class Distribution>
struct FillPhiloxRandomKernel<Distribution, false> {
typedef typename Distribution::ResultElementType T;
- PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size) {
- Distribution dist;
+ PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
+ Distribution dist) {
const int kGroupSize = Distribution::kResultElementCount;
const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -74,7 +74,7 @@ template <class Distribution>
struct FillPhiloxRandomKernel<Distribution, true> {
typedef typename Distribution::ResultElementType T;
PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
- int64 size) {
+ int64 size, Distribution dist) {
using random::PhiloxRandom;
using random::SingleSampleAdapter;
@@ -88,7 +88,6 @@ struct FillPhiloxRandomKernel<Distribution, true> {
const int32 total_thread_count = gridDim.x * blockDim.x;
int64 group_index = thread_id;
int64 offset = group_index * kGroupSize;
- Distribution dist;
while (offset < size) {
// Since each output takes a variable number of samples, we need to
@@ -118,10 +117,10 @@ template <class Distribution>
__global__ void __launch_bounds__(1024)
FillPhiloxRandomKernelLaunch(random::PhiloxRandom base_gen,
typename Distribution::ResultElementType* data,
- int64 size) {
+ int64 size, Distribution dist) {
FillPhiloxRandomKernel<Distribution,
Distribution::kVariableSamplesPerOutput>()
- .Run(base_gen, data, size);
+ .Run(base_gen, data, size, dist);
}
// Partial specialization for GPU
@@ -130,7 +129,7 @@ struct FillPhiloxRandom<GPUDevice, Distribution> {
typedef typename Distribution::ResultElementType T;
typedef GPUDevice Device;
void operator()(OpKernelContext*, const Device& d, random::PhiloxRandom gen,
- T* data, int64 size) {
+ T* data, int64 size, Distribution dist) {
const int32 block_size = d.maxCudaThreadsPerBlock();
const int32 num_blocks =
(d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
@@ -138,7 +137,7 @@ struct FillPhiloxRandom<GPUDevice, Distribution> {
FillPhiloxRandomKernelLaunch<
Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data,
- size);
+ size, dist);
}
};
@@ -150,6 +149,10 @@ template struct FillPhiloxRandom<
template struct FillPhiloxRandom<
GPUDevice, random::UniformDistribution<random::PhiloxRandom, double> >;
template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, int32> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, int64> >;
+template struct FillPhiloxRandom<
GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
template struct FillPhiloxRandom<
GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index a11ea47f7e..6968169248 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -52,11 +52,16 @@ class TileOp : public OpKernel {
errors::InvalidArgument(
"Expected multiples argument to be a vector of length ",
input.dims(), " but got length ", multiples.dim_size(0)));
-
const int input_dims = input.dims();
+
+ // Eigen doesn't support scalars on the GPU, so handle 0-D specially
+ if (input_dims == 0) {
+ context->set_output(0, input);
+ return;
+ }
+
const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
input_dims);
-
TensorShape output_shape;
for (int i = 0; i < input_dims; ++i) {
OP_REQUIRES(
@@ -75,7 +80,6 @@ class TileOp : public OpKernel {
}
#define HANDLE_TYPE(T) \
- HANDLE_DIM(T, 0) \
HANDLE_DIM(T, 1) \
HANDLE_DIM(T, 2) \
HANDLE_DIM(T, 3) \
@@ -142,16 +146,13 @@ inline void TileOp<Device>::HandleCase(
HandleCaseImpl<dtype, ndim>(context, multiples_array, result); \
}
-#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
- HANDLE_CASE(device, dtype, 1); \
- HANDLE_CASE(device, dtype, 2); \
- HANDLE_CASE(device, dtype, 3); \
- HANDLE_CASE(device, dtype, 4); \
- HANDLE_CASE(device, dtype, 5);
-
+// 0-D handled above
#define HANDLE_CASE_DIM(device, dtype) \
- HANDLE_CASE(device, dtype, 0); \
- HANDLE_CASE_DIM_POSITIVE(device, dtype);
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
@@ -163,15 +164,13 @@ HANDLE_CASE_DIM(CPUDevice, DT_INT64);
HANDLE_CASE_DIM(CPUDevice, DT_STRING);
#if GOOGLE_CUDA
-// Eigen on GPU does not handle 0-dimension data types yet.
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM(GPUDevice, DT_INT64);
#endif // GOOGLE_CUDA
-#undef HANDLE_CASE_DIM_POSITIVE
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE
@@ -194,9 +193,15 @@ class TileGradientOp : public OpKernel {
input.dims(), " but got length ", multiples.dim_size(0)));
const int input_dims = input.dims();
+
+ // Eigen doesn't support scalars on the GPU, so handle 0-D specially
+ if (input_dims == 0) {
+ context->set_output(0, input);
+ return;
+ }
+
const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
input_dims);
-
TensorShape output_shape;
std::vector<int32> input_dim_size_vec;
for (int i = 0; i < input_dims; ++i) {
@@ -223,7 +228,6 @@ class TileGradientOp : public OpKernel {
}
#define HANDLE_TYPE(T) \
- HANDLE_DIM(T, 0) \
HANDLE_DIM(T, 1) \
HANDLE_DIM(T, 2) \
HANDLE_DIM(T, 3) \
@@ -282,7 +286,7 @@ class TileGradientOp : public OpKernel {
// NOTE(keveman): Handling the most common case here.
// Adding more cases here would require more templating and code
// explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
- HANDLE_DIM(NDIM > 0 ? 1 : 0);
+ HANDLE_DIM(1);
// Fall through to the unoptimized version.
#undef HANDLE_DIM
@@ -362,16 +366,13 @@ inline void TileGradientOp<Device>::HandleCase(
HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
}
-#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
- HANDLE_CASE(device, dtype, 1); \
- HANDLE_CASE(device, dtype, 2); \
- HANDLE_CASE(device, dtype, 3); \
- HANDLE_CASE(device, dtype, 4); \
- HANDLE_CASE(device, dtype, 5);
-
+// 0-D handled specially above
#define HANDLE_CASE_DIM(device, dtype) \
- HANDLE_CASE(device, dtype, 0); \
- HANDLE_CASE_DIM_POSITIVE(device, dtype);
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
@@ -380,15 +381,13 @@ HANDLE_CASE_DIM(CPUDevice, DT_INT32);
HANDLE_CASE_DIM(CPUDevice, DT_INT64);
#if GOOGLE_CUDA
-// Eigen on GPU does not handle 0-dimension data types yet.
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
-HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM(GPUDevice, DT_INT64);
#endif // GOOGLE_CUDA
-#undef HANDLE_CASE_DIM_POSITIVE
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE
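
With the 0-D case short-circuited before any Eigen dispatch, tiling a scalar now behaves the same on CPU and GPU; the updated `testScalar` in `shape_ops_test.py` later in this diff exercises exactly this. A minimal usage sketch, assuming the contemporary `tf.Session` API of this tree:

```python
import tensorflow as tf

# Tiling a 0-D tensor with an empty multiples list is now a passthrough:
# the kernel simply forwards the input instead of calling into Eigen.
a = tf.constant(7, shape=[], dtype=tf.float32)
tiled = tf.tile(a, [])

with tf.Session() as sess:
    print(sess.run(tiled))  # 7.0, with shape ()
```

Returning the input via `set_output` also avoids allocating a fresh output tensor for this degenerate case.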
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index ba3f6f95e4..745b1cf9f4 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -88,6 +88,71 @@ class UniformDistribution<Generator, double> {
}
};
+template <class Generator>
+class UniformDistribution<Generator, int32> {
+ public:
+ // The number of elements that will be returned.
+ static const int kResultElementCount = Generator::kResultElementCount;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static const bool kVariableSamplesPerOutput = false;
+ typedef Array<int32, kResultElementCount> ResultType;
+ typedef int32 ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {}
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator* gen) {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i) {
+ result[i] = lo_ + static_cast<int32>(sample[i] % range_);
+ }
+ return result;
+ }
+
+ private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int32 lo_;
+ uint32 range_;
+};
+
+template <class Generator>
+class UniformDistribution<Generator, int64> {
+ public:
+ // The number of elements that will be returned.
+ static const int kResultElementCount = Generator::kResultElementCount / 2;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static const bool kVariableSamplesPerOutput = false;
+ typedef Array<int64, kResultElementCount> ResultType;
+ typedef int64 ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {}
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator* gen) {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i) {
+ auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
+ result[i] = lo_ + static_cast<int64>(bits % range_);
+ }
+ return result;
+ }
+
+ private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int64 lo_;
+ uint64 range_;
+};
+
// A class that adapts the underlying native multiple samples to return a single
// sample at a time.
template <class Generator>
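
Both integer specializations above map raw generator bits into `[lo, hi)` with `lo_ + (bits % range_)`, keeping `range_` unsigned so that `hi - lo` cannot overflow when `lo < 0 < hi`. A rough NumPy sketch of that mapping, with `np.random` standing in for the Philox generator (illustrative only, not the kernel):

```python
import numpy as np

rng = np.random.RandomState(0)
# Stand-in for 32 raw bits per output element from the Philox generator.
bits = rng.randint(0, 2**32, size=1000000).astype(np.uint64)

lo, hi = -2, 15
range_ = np.uint64(hi - lo)              # unsigned, as in the C++ members above
samples = lo + (bits % range_).astype(np.int64)

# Every value in [lo, hi) shows up with nearly equal frequency; the modulo
# introduces a bias only on the order of range / 2**32.
counts = np.bincount(samples - lo, minlength=hi - lo)
print(counts / counts.sum())
```

This modulo mapping is also why the op documentation below notes a slight bias whenever `maxval - minval` is not a power of two.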
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index e57860bd57..446be5e136 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -31,7 +31,7 @@ Input images can be of different types but output images are always float.
images: 4-D with shape `[batch, height, width, channels]`.
size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
new size for the images.
-resized_images: 4-D with shape
+resized_images: 4-D with shape
`[batch, new_height, new_width, channels]`.
)doc");
@@ -49,7 +49,7 @@ Input images can be of different types but output images are always float.
images: 4-D with shape `[batch, height, width, channels]`.
size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
new size for the images.
-resized_images: 4-D with shape
+resized_images: 4-D with shape
`[batch, new_height, new_width, channels]`.
)doc");
@@ -67,7 +67,7 @@ Input images can be of different types but output images are always float.
images: 4-D with shape `[batch, height, width, channels]`.
size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
new size for the images.
-resized_images: 4-D with shape
+resized_images: 4-D with shape
`[batch, new_height, new_width, channels]`.
)doc");
@@ -85,7 +85,7 @@ Input images can be of different types but output images are always float.
images: 4-D with shape `[batch, height, width, channels]`.
size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
new size for the images.
-resized_images: 4-D with shape
+resized_images: 4-D with shape
`[batch, new_height, new_width, channels]`.
)doc");
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 9f48da94e1..bcff941d82 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5108,6 +5108,68 @@ op {
is_stateful: true
}
op {
+ name: "RandomUniformInt"
+ input_arg {
+ name: "shape"
+ description: "The shape of the output tensor."
+ type_attr: "T"
+ }
+ input_arg {
+ name: "minval"
+ description: "0-D. Inclusive lower bound on the generated integers."
+ type_attr: "Tout"
+ }
+ input_arg {
+ name: "maxval"
+ description: "0-D. Exclusive upper bound on the generated integers."
+ type_attr: "Tout"
+ }
+ output_arg {
+ name: "output"
+ description: "A tensor of the specified shape filled with uniform random integers."
+ type_attr: "Tout"
+ }
+ attr {
+ name: "seed"
+ type: "int"
+ default_value {
+ i: 0
+ }
+ description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed. Otherwise, it is seeded by a\nrandom seed."
+ }
+ attr {
+ name: "seed2"
+ type: "int"
+ default_value {
+ i: 0
+ }
+ description: "A second seed to avoid seed collision."
+ }
+ attr {
+ name: "Tout"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_INT32
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ name: "T"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_INT32
+ type: DT_INT64
+ }
+ }
+ }
+ summary: "Outputs random integers from a uniform distribution."
+ description: "The generated values are uniform integers in the range `[minval, maxval)`.\nThe lower bound `minval` is included in the range, while the upper bound\n`maxval` is excluded.\n\nThe random integers are slightly biased unless `maxval - minval` is an exact\npower of two. The bias is small for values of `maxval - minval` significantly\nsmaller than the range of the output (either `2^32` or `2^64`)."
+ is_stateful: true
+}
+op {
name: "Range"
input_arg {
name: "start"
@@ -5831,11 +5893,19 @@ op {
description: "The dimension which is partially reversed."
}
attr {
+ name: "batch_dim"
+ type: "int"
+ default_value {
+ i: 0
+ }
+ description: "The dimension along which reversal is performed."
+ }
+ attr {
name: "T"
type: "type"
}
- summary: "Reverses variable length slices in dimension `seq_dim`."
- description: "This op first slices `input` along the first dimension, and for each slice `i`,\nreverses the first `seq_lengths[i]` elements along the dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims(0)`.\n\nThe output slice `i` along dimension 0 is then given by input slice `i`, with\nthe first `seq_lengths[i]` slices along dimension `seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nseq_dim = 1\ninput.dims = (4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```"
+ summary: "Reverses variable length slices."
+ description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n```prettyprint\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
}
op {
name: "Rsqrt"
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 982cf01d29..56148aaafb 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -41,6 +41,38 @@ seed2: A second seed to avoid seed collision.
output: A tensor of the specified shape filled with uniform random values.
)doc");
+REGISTER_OP("RandomUniformInt")
+ .Input("shape: T")
+ .Input("minval: Tout")
+ .Input("maxval: Tout")
+ .SetIsStateful()
+ .Output("output: Tout")
+ .Attr("seed: int = 0")
+ .Attr("seed2: int = 0")
+ .Attr("Tout: {int32, int64}")
+ .Attr("T: {int32, int64}")
+ .Doc(R"doc(
+Outputs random integers from a uniform distribution.
+
+The generated values are uniform integers in the range `[minval, maxval)`.
+The lower bound `minval` is included in the range, while the upper bound
+`maxval` is excluded.
+
+The random integers are slightly biased unless `maxval - minval` is an exact
+power of two. The bias is small for values of `maxval - minval` significantly
+smaller than the range of the output (either `2^32` or `2^64`).
+
+shape: The shape of the output tensor.
+minval: 0-D. Inclusive lower bound on the generated integers.
+maxval: 0-D. Exclusive upper bound on the generated integers.
+seed: If either `seed` or `seed2` are set to be non-zero, the random number
+ generator is seeded by the given seed. Otherwise, it is seeded by a
+ random seed.
+seed2: A second seed to avoid seed collision.
+
+output: A tensor of the specified shape filled with uniform random integers.
+)doc");
+
REGISTER_OP("RandomStandardNormal")
.Input("shape: T")
.SetIsStateful()
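
At the Python level this op is reached through `tf.random_uniform`, whose updated wrapper appears near the end of this diff; a short usage sketch of the documented behavior, with the session boilerplate assumed:

```python
import tensorflow as tf

with tf.Session() as sess:
    # Integer dtypes now work, but maxval must be given explicitly.
    ints = tf.random_uniform([2, 3], minval=0, maxval=17, dtype=tf.int32)
    print(sess.run(ints))        # values in [0, 17)

    # Floats keep the old default range [0, 1).
    floats = tf.random_uniform([2, 3])
    print(sess.run(floats))
```

Omitting `maxval` for an integer dtype raises a `ValueError` in the wrapper rather than deferring the failure to the kernel.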
diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md
index 79abef1717..09485d671e 100644
--- a/tensorflow/g3doc/api_docs/python/array_ops.md
+++ b/tensorflow/g3doc/api_docs/python/array_ops.md
@@ -690,25 +690,28 @@ This is the opposite of pack. The numpy equivalent is
- - -
-### `tf.reverse_sequence(input, seq_lengths, seq_dim, name=None)` {#reverse_sequence}
+### `tf.reverse_sequence(input, seq_lengths, seq_dim, batch_dim=None, name=None)` {#reverse_sequence}
-Reverses variable length slices in dimension `seq_dim`.
+Reverses variable length slices.
-This op first slices `input` along the first dimension, and for each slice `i`,
-reverses the first `seq_lengths[i]` elements along the dimension `seq_dim`.
+This op first slices `input` along the dimension `batch_dim`, and for each
+slice `i`, reverses the first `seq_lengths[i]` elements along
+the dimension `seq_dim`.
The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,
-and `seq_lengths` must be a vector of length `input.dims(0)`.
+and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-The output slice `i` along dimension 0 is then given by input slice `i`, with
-the first `seq_lengths[i]` slices along dimension `seq_dim` reversed.
+The output slice `i` along dimension `batch_dim` is then given by input
+slice `i`, with the first `seq_lengths[i]` slices along dimension
+`seq_dim` reversed.
For example:
```prettyprint
# Given this:
+batch_dim = 0
seq_dim = 1
-input.dims = (4, ...)
+input.dims = (4, 8, ...)
seq_lengths = [7, 2, 3, 5]
# then slices of input are reversed on seq_dim, but only up to seq_lengths:
@@ -724,6 +727,27 @@ output[2, 3:, :, ...] = input[2, 3:, :, ...]
output[3, 2:, :, ...] = input[3, 2:, :, ...]
```
+In contrast, if:
+```prettyprint
+# Given this:
+batch_dim = 2
+seq_dim = 0
+input.dims = (8, ?, 4, ...)
+seq_lengths = [7, 2, 3, 5]
+
+# then slices of input are reversed on seq_dim, but only up to seq_lengths:
+output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+
+# while entries past seq_lens are copied through:
+output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+```
+
##### Args:
@@ -732,6 +756,8 @@ output[3, 2:, :, ...] = input[3, 2:, :, ...]
1-D with length `input.dims(0)` and
`max(seq_lengths) < input.dims(seq_dim)`
* <b>`seq_dim`</b>: An `int`. The dimension which is partially reversed.
+* <b>`batch_dim`</b>: An optional `int`. Defaults to `0`.
+ The dimension along which reversal is performed.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
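
For readers who prefer code to slice notation, here is a rough NumPy rendering of the `batch_dim = 0`, `seq_dim = 1` case above; it sketches the semantics only and is not the TensorFlow kernel:

```python
import numpy as np

def reverse_sequence_np(x, seq_lengths):
    """Sketch of tf.reverse_sequence for batch_dim=0, seq_dim=1 only."""
    out = x.copy()
    for i, length in enumerate(seq_lengths):
        out[i, :length] = x[i, :length][::-1]  # reverse the first `length` steps
    return out

x = np.arange(4 * 8).reshape(4, 8)
print(reverse_sequence_np(x, [7, 2, 3, 5]))
```

Other `batch_dim`/`seq_dim` combinations follow the same pattern, with the relevant axes moved into these two positions.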
diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md
index 99a075f14d..f9c39bd677 100644
--- a/tensorflow/g3doc/api_docs/python/sparse_ops.md
+++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md
@@ -157,13 +157,13 @@ Alias for field number 1
- - -
-### `tf.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value, name=None)` {#sparse_to_dense}
+### `tf.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0, name=None)` {#sparse_to_dense}
Converts a sparse representation into a dense tensor.
Builds an array `dense` with shape `output_shape` such that
-```prettyprint
+```python
# If sparse_indices is scalar
dense[i] = (i == sparse_indices ? sparse_values : default_value)
@@ -174,34 +174,32 @@ dense[sparse_indices[i]] = sparse_values[i]
dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
```
-All other values in `dense` are set to `default_value`. If `sparse_values` is a
-scalar, all sparse indices are set to this single value.
+All other values in `dense` are set to `default_value`. If `sparse_values`
+is a scalar, all sparse indices are set to this single value.
##### Args:
-* <b>`sparse_indices`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
- 0-D, 1-D, or 2-D. `sparse_indices[i]` contains the complete
- index where `sparse_values[i]` will be placed.
-* <b>`output_shape`</b>: A `Tensor`. Must have the same type as `sparse_indices`.
- 1-D. Shape of the dense output tensor.
-* <b>`sparse_values`</b>: A `Tensor`.
- 1-D. Values corresponding to each row of `sparse_indices`,
- or a scalar value to be used for all sparse indices.
-* <b>`default_value`</b>: A `Tensor`. Must have the same type as `sparse_values`.
- Scalar value to set for indices not specified in
- `sparse_indices`.
+* <b>`sparse_indices`</b>: A 0-D, 1-D, or 2-D `Tensor` of type `int32` or `int64`.
+ `sparse_indices[i]` contains the complete index where `sparse_values[i]`
+ will be placed.
+* <b>`output_shape`</b>: A 1-D `Tensor` of the same type as `sparse_indices`. Shape
+ of the dense output tensor.
+* <b>`sparse_values`</b>: A 0-D or 1-D `Tensor`. Values corresponding to each row of
+ `sparse_indices`, or a scalar value to be used for all sparse indices.
+* <b>`default_value`</b>: A 0-D `Tensor` of the same type as `sparse_values`. Value
+ to set for indices not specified in `sparse_indices`. Defaults to zero.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
- A `Tensor`. Has the same type as `sparse_values`.
- Dense output tensor of shape `output_shape`.
+ Dense `Tensor` of shape `output_shape`. Has the same type as
+ `sparse_values`.
- - -
-### `tf.sparse_tensor_to_dense(sp_input, default_value, name=None)` {#sparse_tensor_to_dense}
+### `tf.sparse_tensor_to_dense(sp_input, default_value=0, name=None)` {#sparse_tensor_to_dense}
Converts a `SparseTensor` into a dense tensor.
@@ -225,7 +223,7 @@ string tensor with values:
* <b>`sp_input`</b>: The input `SparseTensor`.
* <b>`default_value`</b>: Scalar value to set for indices not specified in
- `sp_input`.
+ `sp_input`. Defaults to zero.
* <b>`name`</b>: A name prefix for the returned tensors (optional).
##### Returns:
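
A small usage sketch of the new `default_value=0` default, mirroring the `testZeroDefault` case added later in this diff (session setup assumed):

```python
import tensorflow as tf

with tf.Session() as sess:
    # Scalar index 2 into a length-4 vector; default_value is left at 0.
    print(sess.run(tf.sparse_to_dense(2, [4], 7)))  # [0 0 7 0]

    # Per-index values with an explicit non-zero default.
    print(sess.run(tf.sparse_to_dense([0, 3], [4], [5, 9],
                                      default_value=-1)))  # [ 5 -1 -1  9]
```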
diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md
index fde583e47f..6f79d22d3f 100644
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@@ -548,7 +548,7 @@ $ sudo easy_install -U six
[MacPorts](https://www.macports.org/) and re-install TensorFlow in that
copy of Python.
-# Mac OS X: TypeError: `__init__()` got an unexpected keyword argument 'syntax'
+### Mac OS X: TypeError: `__init__()` got an unexpected keyword argument 'syntax'
On Mac OS X, you may encounter the following when importing tensorflow.
diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
index f1b7bb8205..23ca971224 100644
--- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
+++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
@@ -69,7 +69,7 @@ The code example below is a modification of the [simple MNIST tutorial]
added some summary ops, and run them every ten steps. If you run this and then
launch `tensorboard --logdir=/tmp/mnist_data`, you'll be able to visualize
statistics, such as how the weights or accuracy varied during training.
-The code below is an exerpt; full source is [here](mnist_with_summaries.py).
+The code below is an excerpt; full source is [here](../../tutorials/mnist/mnist_with_summaries.py).
```python
# Create the model
diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py
deleted file mode 100644
index cea82b137e..0000000000
--- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""A very simple MNIST classifer, modified to display data in TensorBoard
-
-See extensive documentation for the original model at
-http://tensorflow.org/tutorials/mnist/beginners/index.md
-
-See documentaion on the TensorBoard specific pieces at
-http://tensorflow.org/how_tos/summaries_and_tensorboard/index.md
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Import data
-import input_data
-mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
-
-import tensorflow as tf
-sess = tf.InteractiveSession()
-
-# Create the model
-x = tf.placeholder("float", [None, 784], name="x-input")
-W = tf.Variable(tf.zeros([784,10]), name="weights")
-b = tf.Variable(tf.zeros([10], name="bias"))
-
-# use a name scope to organize nodes in the graph visualizer
-with tf.name_scope("Wx_b") as scope:
- y = tf.nn.softmax(tf.matmul(x,W) + b)
-
-# Add summary ops to collect data
-w_hist = tf.histogram_summary("weights", W)
-b_hist = tf.histogram_summary("biases", b)
-y_hist = tf.histogram_summary("y", y)
-
-# Define loss and optimizer
-y_ = tf.placeholder("float", [None,10], name="y-input")
-# More name scopes will clean up the graph representation
-with tf.name_scope("xent") as scope:
- cross_entropy = -tf.reduce_sum(y_*tf.log(y))
- ce_summ = tf.scalar_summary("cross entropy", cross_entropy)
-with tf.name_scope("train") as scope:
- train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
-
-with tf.name_scope("test") as scope:
- correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
- accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
- accuracy_summary = tf.scalar_summary("accuracy", accuracy)
-
-# Merge all the summaries and write them out to /tmp/mnist_logs
-merged = tf.merge_all_summaries()
-writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph_def)
-tf.initialize_all_variables().run()
-
-# Train the model, and feed in test data and record summaries every 10 steps
-
-for i in range(1000):
- if i % 10 == 0: # Record summary data, and the accuracy
- feed = {x: mnist.test.images, y_: mnist.test.labels}
- result = sess.run([merged, accuracy], feed_dict=feed)
- summary_str = result[0]
- acc = result[1]
- writer.add_summary(summary_str, i)
- print("Accuracy at step %s: %s" % (i, acc))
- else:
- batch_xs, batch_ys = mnist.train.next_batch(100)
- feed = {x: batch_xs, y_: batch_ys}
- sess.run(train_step, feed_dict=feed)
-
-print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
diff --git a/tensorflow/g3doc/tutorials/mnist/beginners/index.md b/tensorflow/g3doc/tutorials/mnist/beginners/index.md
index 44efd43235..569cbef9e9 100644
--- a/tensorflow/g3doc/tutorials/mnist/beginners/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/beginners/index.md
@@ -76,10 +76,11 @@ Isn't that bad? Well, the best computer vision methods do exploit this
structure, and we will in later tutorials. But the simple method we will be
using here, a softmax regression, won't.
-The result is that `mnist.train.images` is a tensor (an n-dimensional array) with a
-shape of `[60000, 784]`. The first dimension indexes the images and the second
-dimension indexes the pixels in each image. Each entry in the tensor is the
-pixel intensity between 0 and 1, for a particular pixel in a particular image.
+The result is that `mnist.train.images` is a tensor (an n-dimensional array)
+with a shape of `[55000, 784]`. The first dimension indexes the images and the
+second dimension indexes the pixels in each image. Each entry in the tensor is
+the pixel intensity between 0 and 1, for a particular pixel in a particular
+image.
<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="img/mnist-train-xs.png">
@@ -89,11 +90,11 @@ The corresponding labels in MNIST are numbers between 0 and 9, describing
which digit a given image is of.
For the purposes of this tutorial, we're going to want our labels
as "one-hot vectors". A one-hot vector is a vector which is 0 in most
-dimensions, and 1 in a single dimension. In this case, the \\(n\\)th digit will be
-represented as a vector which is 1 in the \\(n\\)th dimensions. For example, 3
-would be \\([0,0,0,1,0,0,0,0,0,0]\\).
+dimensions, and 1 in a single dimension. In this case, the \\(n\\)th digit will
+be represented as a vector which is 1 in the \\(n\\)th dimensions. For example,
+3 would be \\([0,0,0,1,0,0,0,0,0,0]\\).
Consequently, `mnist.train.labels` is a
-`[60000, 10]` array of floats.
+`[55000, 10]` array of floats.
<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="img/mnist-train-ys.png">
diff --git a/tensorflow/g3doc/tutorials/mnist/input_data.py b/tensorflow/g3doc/tutorials/mnist/input_data.py
index 26bef9eda6..3078137f2e 100644
--- a/tensorflow/g3doc/tutorials/mnist/input_data.py
+++ b/tensorflow/g3doc/tutorials/mnist/input_data.py
@@ -91,12 +91,15 @@ def extract_labels(filename, one_hot=False):
class DataSet(object):
- def __init__(self, images, labels, fake_data=False):
+ def __init__(self, images, labels, fake_data=False, one_hot=False):
+ """Construct a DataSet. one_hot arg is used only if fake_data is true."""
+
if fake_data:
self._num_examples = 10000
+ self.one_hot = one_hot
else:
assert images.shape[0] == labels.shape[0], (
- "images.shape: %s labels.shape: %s" % (images.shape,
+ 'images.shape: %s labels.shape: %s' % (images.shape,
labels.shape))
self._num_examples = images.shape[0]
@@ -132,8 +135,11 @@ class DataSet(object):
def next_batch(self, batch_size, fake_data=False):
"""Return the next `batch_size` examples from this data set."""
if fake_data:
- fake_image = [1.0 for _ in xrange(784)]
- fake_label = 0
+ fake_image = [1] * 784
+ if self.one_hot:
+ fake_label = [1] + [0] * 9
+ else:
+ fake_label = 0
return [fake_image for _ in xrange(batch_size)], [
fake_label for _ in xrange(batch_size)]
start = self._index_in_epoch
@@ -160,9 +166,9 @@ def read_data_sets(train_dir, fake_data=False, one_hot=False):
data_sets = DataSets()
if fake_data:
- data_sets.train = DataSet([], [], fake_data=True)
- data_sets.validation = DataSet([], [], fake_data=True)
- data_sets.test = DataSet([], [], fake_data=True)
+ data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
+ data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
+ data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
return data_sets
TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
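
The new `one_hot` argument only matters when `fake_data` is set; a hedged sketch of what a fake batch looks like with it enabled, importing from the source tree as the new tutorial script does (the path is ignored in this mode, and the batch size is illustrative):

```python
from tensorflow.g3doc.tutorials.mnist import input_data

mnist = input_data.read_data_sets('/tmp/data/', fake_data=True, one_hot=True)
images, labels = mnist.train.next_batch(2, fake_data=True)
print(labels)          # two one-hot labels, each a 1 followed by nine 0s
print(len(images[0]))  # 784 dummy pixel values of 1
```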
diff --git a/tensorflow/g3doc/tutorials/mnist/mnist_with_summaries.py b/tensorflow/g3doc/tutorials/mnist/mnist_with_summaries.py
new file mode 100644
index 0000000000..95373404d6
--- /dev/null
+++ b/tensorflow/g3doc/tutorials/mnist/mnist_with_summaries.py
@@ -0,0 +1,107 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A very simple MNIST classifier, modified to display data in TensorBoard.
+
+See extensive documentation for the original model at
+http://tensorflow.org/tutorials/mnist/beginners/index.md
+
+See documentation on the TensorBoard specific pieces at
+http://tensorflow.org/how_tos/summaries_and_tensorboard/index.md
+
+If you modify this file, please update the excerpt in
+how_tos/summaries_and_tensorboard/index.md.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.python.platform
+from tensorflow.g3doc.tutorials.mnist import input_data
+import tensorflow as tf
+
+flags = tf.app.flags
+FLAGS = flags.FLAGS
+flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
+ 'for unit testing.')
+flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.')
+flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
+
+
+def main(_):
+ # Import data
+ mnist = input_data.read_data_sets('/tmp/data/', one_hot=True,
+ fake_data=FLAGS.fake_data)
+
+ sess = tf.InteractiveSession()
+
+ # Create the model
+ x = tf.placeholder('float', [None, 784], name='x-input')
+ W = tf.Variable(tf.zeros([784, 10]), name='weights')
+ b = tf.Variable(tf.zeros([10], name='bias'))
+
+ # use a name scope to organize nodes in the graph visualizer
+ with tf.name_scope('Wx_b') as scope:
+ y = tf.nn.softmax(tf.matmul(x, W) + b)
+
+ # Add summary ops to collect data
+ w_hist = tf.histogram_summary('weights', W)
+ b_hist = tf.histogram_summary('biases', b)
+ y_hist = tf.histogram_summary('y', y)
+
+ # Define loss and optimizer
+ y_ = tf.placeholder('float', [None, 10], name='y-input')
+ # More name scopes will clean up the graph representation
+ with tf.name_scope('xent') as scope:
+ cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
+ ce_summ = tf.scalar_summary('cross entropy', cross_entropy)
+ with tf.name_scope('train') as scope:
+ train_step = tf.train.GradientDescentOptimizer(
+ FLAGS.learning_rate).minimize(cross_entropy)
+
+ with tf.name_scope('test') as scope:
+ correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
+ accuracy_summary = tf.scalar_summary('accuracy', accuracy)
+
+ # Merge all the summaries and write them out to /tmp/mnist_logs
+ merged = tf.merge_all_summaries()
+ writer = tf.train.SummaryWriter('/tmp/mnist_logs', sess.graph_def)
+ tf.initialize_all_variables().run()
+
+ # Train the model, and feed in test data and record summaries every 10 steps
+
+ for i in range(FLAGS.max_steps):
+ if i % 10 == 0: # Record summary data, and the accuracy
+ if FLAGS.fake_data:
+ batch_xs, batch_ys = mnist.train.next_batch(
+ 100, fake_data=FLAGS.fake_data)
+ feed = {x: batch_xs, y_: batch_ys}
+ else:
+ feed = {x: mnist.test.images, y_: mnist.test.labels}
+ result = sess.run([merged, accuracy], feed_dict=feed)
+ summary_str = result[0]
+ acc = result[1]
+ writer.add_summary(summary_str, i)
+ print('Accuracy at step %s: %s' % (i, acc))
+ else:
+ batch_xs, batch_ys = mnist.train.next_batch(
+ 100, fake_data=FLAGS.fake_data)
+ feed = {x: batch_xs, y_: batch_ys}
+ sess.run(train_step, feed_dict=feed)
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/tensorflow/g3doc/tutorials/word2vec/index.md b/tensorflow/g3doc/tutorials/word2vec/index.md
index c4425bdbef..3dc632c1bc 100644
--- a/tensorflow/g3doc/tutorials/word2vec/index.md
+++ b/tensorflow/g3doc/tutorials/word2vec/index.md
@@ -147,7 +147,7 @@ $$J_\text{NEG} = \log Q_\theta(D=1 |w_t, h) +
where \\(Q_\theta(D=1 | w, h)\\) is the binary logistic regression probability
under the model of seeing the word \\(w\\) in the context \\(h\\) in the dataset
\\(D\\), calculated in terms of the learned embedding vectors \\(\theta\\). In
-practice we approximate the expectation by drawing \\(k\\) constrastive words
+practice we approximate the expectation by drawing \\(k\\) contrastive words
from the noise distribution (i.e. we compute a
[Monte Carlo average](https://en.wikipedia.org/wiki/Monte_Carlo_integration)).
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 49f42dd6f3..618d0e2ad3 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -487,6 +487,7 @@ tf_gen_op_wrapper_py(
name = "random_ops",
hidden = [
"RandomUniform",
+ "RandomUniformInt",
"RandomShuffle",
"RandomStandardNormal",
"TruncatedNormal",
@@ -510,6 +511,7 @@ tf_gen_op_wrapper_py(
"SparseConcat",
"SparseSelectLastK",
"SparseReorder",
+ "SparseToDense",
],
require_shape_functions = True,
)
diff --git a/tensorflow/python/kernel_tests/random_ops_test.py b/tensorflow/python/kernel_tests/random_ops_test.py
index c714c7e3cb..620d066934 100644
--- a/tensorflow/python/kernel_tests/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random_ops_test.py
@@ -168,20 +168,23 @@ class RandomUniformTest(tf.test.TestCase):
return func
def testRange(self):
- for use_gpu in [False, True]:
- for dt in tf.float32, tf.float64:
- sampler = self._Sampler(1000, -2., 8., dt, use_gpu=use_gpu)
+ for use_gpu in False, True:
+ for dt in tf.float32, tf.float64, tf.int32, tf.int64:
+ sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt,
+ use_gpu=use_gpu)
x = sampler()
self.assertTrue(-2 <= np.min(x))
- self.assertTrue(np.max(x) <= 8)
+ self.assertTrue(np.max(x) < 8)
# Asserts that different trials (1000 samples per trial) is unlikely
# to see the same sequence of values. Will catch buggy
# implementations which uses the same random number seed.
def testDistinct(self):
- for use_gpu in [False, True]:
- for dt in tf.float32, tf.float64:
- sampler = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu)
+ for use_gpu in False, True:
+ for dt in tf.float32, tf.float64, tf.int32, tf.int64:
+ maxv = 1.0 if dt.is_floating else 1 << 30
+ sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt,
+ use_gpu=use_gpu)
x = sampler()
y = sampler()
count = (x == y).sum()
@@ -191,33 +194,57 @@ class RandomUniformTest(tf.test.TestCase):
print("count = ", count)
self.assertTrue(count < 10)
+ # Check that uniform ints actually follow a uniform distribution.
+ def testUniformInts(self):
+ minv = -2
+ maxv = 15
+ n = 100000
+ p = 1 / (maxv - minv)
+ # The counts should follow an (n, p) binomial distribution.
+ mean = p * n
+ std = np.sqrt(n * p * (1 - p))
+ for use_gpu in False, True:
+ for dt in tf.int32, tf.int64:
+ # Use a fixed seed here to make the test deterministic.
+ # Without the fixed seed, the 5 * std bound will (very rarely) fail.
+ sampler = self._Sampler(n // 10, minv=minv, maxv=maxv, dtype=dt,
+ use_gpu=use_gpu, seed=17)
+ x = sampler().ravel()
+ self.assertEqual(x.shape, (n,))
+ counts, _ = np.histogram(x, bins=maxv - minv)
+ self.assertEqual(counts.shape, (maxv - minv,))
+ self.assertEqual(counts.sum(), n)
+ error = np.abs(counts - mean)
+ self.assertLess(error.max(), 5 * std)
+
# Checks that the CPU and GPU implementation returns the same results,
# given the same random seed
def testCPUGPUMatch(self):
- for dt in tf.float32, tf.float64:
+ for dt in tf.float32, tf.float64, tf.int32, tf.int64:
+ maxv = 1.0 if dt.is_floating else 17
results = {}
- for use_gpu in [False, True]:
- sampler = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu, seed=12345)
+ for use_gpu in False, True:
+ sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt,
+ use_gpu=use_gpu, seed=12345)
results[use_gpu] = sampler()
- self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
+ self.assertAllEqual(results[False], results[True])
def testSeed(self):
- for use_gpu in [False, True]:
- for dt in tf.float32, tf.float64:
- sx = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu, seed=345)
- sy = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu, seed=345)
+ for use_gpu in False, True:
+ for dt in tf.float32, tf.float64, tf.int32, tf.int64:
+ sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=use_gpu, seed=345)
+ sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=use_gpu, seed=345)
self.assertAllEqual(sx(), sy())
def testNoCSE(self):
- for use_gpu in [False, True]:
- with self.test_session(use_gpu=use_gpu):
- shape = [2, 3, 4]
- rnd1 = tf.random_uniform(shape, 0.0, 1.0,
- dtype=tf.float32)
- rnd2 = tf.random_uniform(shape, 0.0, 1.0,
- dtype=tf.float32)
- diff = (rnd2 - rnd1).eval()
- self.assertTrue(np.linalg.norm(diff) > 0.1)
+ shape = [2, 3, 4]
+ for use_gpu in False, True:
+ for dtype in tf.float32, tf.int32:
+ with self.test_session(use_gpu=use_gpu):
+ rnd1 = tf.random_uniform(shape, 0, 17, dtype=dtype)
+ rnd2 = tf.random_uniform(shape, 0, 17, dtype=dtype)
+ diff = (rnd2 - rnd1).eval()
+ self.assertTrue(np.linalg.norm(diff) > 0.1)
class RandomShapeTest(tf.test.TestCase):
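
For reference, the 5-sigma bound that `testUniformInts` checks works out roughly as follows; this is a back-of-the-envelope computation, not part of the commit:

```python
import numpy as np

n = 100000                      # total samples drawn in the test
p = 1.0 / (15 - (-2))           # 17 equally likely integers in [-2, 15)
mean = n * p                    # ~5882 expected hits per bin
std = np.sqrt(n * p * (1 - p))  # ~74.4
print(mean, std, 5 * std)       # each bin may deviate from the mean by < 5 std (~372)
```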
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index 2621ad9dec..81be48990b 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -216,13 +216,14 @@ class ShapeOpsTest(tf.test.TestCase):
class TileTest(tf.test.TestCase):
def testScalar(self):
- with self.test_session():
- a = tf.constant(7, shape=[], dtype=tf.float32)
- tiled = tf.tile(a, [])
- result = tiled.eval()
- self.assertEqual(result.shape, ())
- self.assertEqual([], tiled.get_shape())
- self.assertEqual(7, result)
+ for use_gpu in False, True:
+ with self.test_session(use_gpu=use_gpu):
+ a = tf.constant(7, shape=[], dtype=tf.float32)
+ tiled = tf.tile(a, [])
+ result = tiled.eval()
+ self.assertEqual(result.shape, ())
+ self.assertEqual([], tiled.get_shape())
+ self.assertEqual(7, result)
def testSimple(self):
with self.test_session():
@@ -357,20 +358,23 @@ class TileTest(tf.test.TestCase):
self.assertAllClose(expected, result, 1e-3)
def _RunAndVerifyGradientResult(self, input_shape, multiples):
- with self.test_session():
- # Random values
- inp = np.random.rand(*input_shape)
- a = tf.constant([float(x) for x in inp.flatten()],
- shape=input_shape, dtype=tf.float64)
- tiled = tf.tile(a, multiples)
- grad_shape = list(np.array(multiples) * np.array(inp.shape))
- err = tf.test.compute_gradient_error(a,
- list(input_shape),
- tiled,
- grad_shape,
- x_init_value=inp)
- print("tile(float) error = ", err)
- self.assertLess(err, 1e-3)
+ for use_gpu in False, True:
+ with self.test_session(use_gpu=use_gpu):
+ # Random values
+ inp = np.asarray(np.random.rand(*input_shape))
+ a = tf.constant(inp, dtype=tf.float64)
+ tiled = tf.tile(a, multiples)
+ grad_shape = list(np.array(multiples) * np.array(inp.shape))
+ err = tf.test.compute_gradient_error(a,
+ list(input_shape),
+ tiled,
+ grad_shape,
+ x_init_value=inp)
+ print("tile(float) error = ", err)
+ self.assertLess(err, 1e-3)
+
+ def testGradientRandomScalar(self):
+ self._RunAndVerifyGradientResult([], [])
def testGradientRandom(self):
self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 2, 1, 3, 1])
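A minimal sketch of the scalar-tile case exercised by the new tests above; it assumes the pre-1.0 Python API in this tree (tf.tile, tf.Session) and simply shows that tiling with an empty multiples list leaves the scalar unchanged.

```python
import tensorflow as tf

with tf.Session() as sess:
    a = tf.constant(7, shape=[], dtype=tf.float32)
    tiled = tf.tile(a, [])          # empty multiples: no replication
    print(sess.run(tiled))          # 7.0
    print(tiled.get_shape())        # (), still a scalar
```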
diff --git a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
index 264ef03c88..ee9a697a0b 100644
--- a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
+++ b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
@@ -71,6 +71,11 @@ class SparseToDenseTest(tf.test.TestCase):
[ 1, -1, -1, -1]]).astype(np.int32)
self.assertAllClose(np_ans, tf_ans)
+ def testZeroDefault(self):
+ with self.test_session():
+ x = tf.sparse_to_dense(2, [4], 7).eval()
+ self.assertAllEqual(x, [0, 0, 7, 0])
+
def test3d(self):
with self.test_session(use_gpu=False):
tf_ans = _SparseToDense([[1, 3, 0], [2, 0, 1]], [3, 4, 2], 1, -1).eval()
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index cefeec54bb..428d591acc 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -122,7 +122,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
ops.NoGradient("TruncatedNormal")
-def random_uniform(shape, minval=0.0, maxval=1.0,
+def random_uniform(shape, minval=0, maxval=None,
dtype=dtypes.float32, seed=None,
name=None):
"""Outputs random values from a uniform distribution.
@@ -131,13 +131,22 @@ def random_uniform(shape, minval=0.0, maxval=1.0,
`[minval, maxval)`. The lower bound `minval` is included in the range, while
the upper bound `maxval` is excluded.
+ For floats, the default range is `[0, 1)`. For ints, at least `maxval` must
+ be specified explicitly.
+
+ In the integer case, the random integers are slightly biased unless
+ `maxval - minval` is an exact power of two. The bias is small for values of
+ `maxval - minval` significantly smaller than the range of the output (either
+ `2**32` or `2**64`).
+
Args:
shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
minval: A 0-D Tensor or Python value of type `dtype`. The lower bound on the
- range of random values to generate.
+ range of random values to generate. Defaults to 0.
maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on
- the range of random values to generate.
- dtype: The type of the output.
+ the range of random values to generate. Defaults to 1 if `dtype` is
+ floating point.
+ dtype: The type of the output: `float32`, `float64`, `int32`, or `int64`.
seed: A Python integer. Used to create a random seed for the distribution.
See
[`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
@@ -146,19 +155,28 @@ def random_uniform(shape, minval=0.0, maxval=1.0,
Returns:
A tensor of the specified shape filled with random uniform values.
+
+ Raises:
+ ValueError: If `dtype` is integral and `maxval` is not specified.
"""
+ dtype = dtypes.as_dtype(dtype)
+ if maxval is None:
+ if dtype.is_integer:
+ raise ValueError("Must specify maxval for integer dtype %r" % dtype)
+ maxval = 1
with ops.op_scope([shape, minval, maxval], name, "random_uniform") as name:
- shape_tensor = _ShapeTensor(shape)
- min_tensor = ops.convert_to_tensor(minval, dtype=dtype, name="min")
- range_tensor = ops.convert_to_tensor(
- maxval - minval, dtype=dtype, name="range")
+ shape = _ShapeTensor(shape)
+ minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
+ maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max")
seed1, seed2 = random_seed.get_seed(seed)
- rnd = gen_random_ops._random_uniform(shape_tensor, dtype,
- seed=seed1,
- seed2=seed2)
- mul = rnd * range_tensor
- value = math_ops.add(mul, min_tensor, name=name)
- return value
+ if dtype.is_integer:
+ return gen_random_ops._random_uniform_int(shape, minval, maxval,
+ seed=seed1, seed2=seed2,
+ name=name)
+ else:
+ rnd = gen_random_ops._random_uniform(shape, dtype, seed=seed1,
+ seed2=seed2)
+ return math_ops.add(rnd * (maxval - minval), minval, name=name)
def random_shuffle(value, seed=None, name=None):
@@ -197,6 +215,7 @@ ops.NoGradient("RandomUniform")
@ops.RegisterShape("TruncatedNormal")
@ops.RegisterShape("RandomStandardNormal")
@ops.RegisterShape("RandomUniform")
+@ops.RegisterShape("RandomUniformInt")
def _RandomShape(op):
shape_val = tensor_util.ConstantValue(op.inputs[0])
if shape_val is not None:
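A minimal usage sketch of the integer support added to random_uniform above; it assumes the pre-1.0 Python API in this tree (tf.random_uniform, tf.Session) and only illustrates the behavior described in the updated docstring.

```python
import tensorflow as tf

with tf.Session() as sess:
    # Floats: maxval defaults to 1, giving the usual [0, 1) range.
    floats = tf.random_uniform([3], seed=42)
    # Ints: maxval is required; draws lie in [minval, maxval), here [0, 17).
    ints = tf.random_uniform([3], minval=0, maxval=17,
                             dtype=tf.int32, seed=42)
    print(sess.run([floats, ints]))
    # tf.random_uniform([3], dtype=tf.int32) would raise ValueError,
    # since maxval must be given explicitly for integer dtypes.
```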
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 1507371b40..2f82f3c6ad 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -240,7 +240,48 @@ def _SparseToDenseShape(op):
return [tensor_shape.unknown_shape(ndims=input_shape_shape.num_elements())]
-def sparse_tensor_to_dense(sp_input, default_value, name=None):
+def sparse_to_dense(sparse_indices, output_shape, sparse_values,
+ default_value=0, name=None):
+ """Converts a sparse representation into a dense tensor.
+
+ Builds an array `dense` with shape `output_shape` such that
+
+ ```python
+ # If sparse_indices is scalar
+ dense[i] = (i == sparse_indices ? sparse_values : default_value)
+
+ # If sparse_indices is a vector, then for each i
+ dense[sparse_indices[i]] = sparse_values[i]
+
+ # If sparse_indices is an n by d matrix, then for each i in [0, n)
+ dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+ ```
+
+ All other values in `dense` are set to `default_value`. If `sparse_values`
+ is a scalar, all sparse indices are set to this single value.
+
+ Args:
+ sparse_indices: A 0-D, 1-D, or 2-D `Tensor` of type `int32` or `int64`.
+ `sparse_indices[i]` contains the complete index where `sparse_values[i]`
+ will be placed.
+ output_shape: A 1-D `Tensor` of the same type as `sparse_indices`. Shape
+ of the dense output tensor.
+ sparse_values: A 0-D or 1-D `Tensor`. Values corresponding to each row of
+ `sparse_indices`, or a scalar value to be used for all sparse indices.
+ default_value: A 0-D `Tensor` of the same type as `sparse_values`. Value
+ to set for indices not specified in `sparse_indices`. Defaults to zero.
+ name: A name for the operation (optional).
+
+ Returns:
+ Dense `Tensor` of shape `output_shape`. Has the same type as
+ `sparse_values`.
+ """
+ return gen_sparse_ops._sparse_to_dense(sparse_indices, output_shape,
+ sparse_values, default_value,
+ name=name)
+
+
+def sparse_tensor_to_dense(sp_input, default_value=0, name=None):
"""Converts a `SparseTensor` into a dense tensor.
This op is a convenience wrapper around `sparse_to_dense` for `SparseTensor`s.
@@ -261,7 +302,7 @@ def sparse_tensor_to_dense(sp_input, default_value, name=None):
Args:
sp_input: The input `SparseTensor`.
default_value: Scalar value to set for indices not specified in
- `sp_input`.
+ `sp_input`. Defaults to zero.
name: A name prefix for the returned tensors (optional).
Returns:
@@ -275,12 +316,8 @@ def sparse_tensor_to_dense(sp_input, default_value, name=None):
if not isinstance(sp_input, ops.SparseTensor):
raise TypeError("Input must be a SparseTensor")
- return gen_sparse_ops.sparse_to_dense(
- sp_input.indices,
- sp_input.shape,
- sp_input.values,
- default_value,
- name=name)
+ return sparse_to_dense(sp_input.indices, sp_input.shape, sp_input.values,
+ default_value, name=name)
def sparse_to_indicator(sp_input, vocab_size, name=None):
@@ -455,7 +492,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
all_row_indices = math_ops.cast(math_ops.range(num_rows), dtypes.int64)
empty_row_indices, _ = array_ops.list_diff(
all_row_indices, sp_input.indices[:, 0])
- empty_row_indicator = gen_sparse_ops.sparse_to_dense(
+ empty_row_indicator = sparse_to_dense(
empty_row_indices, array_ops.expand_dims(sp_input.shape[0], -1), True,
False)
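A minimal sketch of the new tf.sparse_to_dense wrapper; default_value now defaults to 0, matching the testZeroDefault case added above. It assumes the pre-1.0 Python API in this tree.

```python
import tensorflow as tf

with tf.Session() as sess:
    # Scalar index 2 in a length-4 vector, value 7, everything else 0.
    print(sess.run(tf.sparse_to_dense(2, [4], 7)))          # [0 0 7 0]
    # Vector of indices with per-index values and an explicit default.
    print(sess.run(tf.sparse_to_dense([0, 3], [5], [10, 20],
                                      default_value=-1)))   # [10 -1 -1 20 -1]
```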
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c5b490ce4c..2146405d3e 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -269,7 +269,7 @@ def variable_scope(name_or_scope, reuse=None, initializer=None):
Sharing a variable by capturing a scope and setting reuse:
```python
- with tf.variable_scope("foo") as scope.
+ with tf.variable_scope("foo") as scope:
v = tf.get_variable("v", [1])
scope.reuse_variables()
v1 = tf.get_variable("v", [1])
@@ -280,7 +280,7 @@ def variable_scope(name_or_scope, reuse=None, initializer=None):
getting an existing variable in a non-reusing scope.
```python
- with tf.variable_scope("foo") as scope.
+ with tf.variable_scope("foo"):
v = tf.get_variable("v", [1])
v1 = tf.get_variable("v", [1])
# Raises ValueError("... v already exists ...").
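A minimal end-to-end sketch of the corrected variable-sharing pattern from the docstring above; it assumes tf.get_variable falls back to its default initializer when none is given.

```python
import tensorflow as tf

with tf.variable_scope("foo") as scope:
    v = tf.get_variable("v", [1])
    scope.reuse_variables()
    v1 = tf.get_variable("v", [1])
assert v1 is v  # reuse returns the same underlying variable object
```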
diff --git a/tensorflow/tensorboard/scripts/tfserve.js b/tensorflow/tensorboard/scripts/tfserve.js
index 1d70fade3f..d4d1e32b27 100644
--- a/tensorflow/tensorboard/scripts/tfserve.js
+++ b/tensorflow/tensorboard/scripts/tfserve.js
@@ -1,3 +1,18 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
/**
* Simple server for running TensorBoard during development.
*/
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index ac7eede6d9..04841fac71 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -4,6 +4,8 @@ cc_library(
name = "eigen3",
hdrs = glob([
"**/*.h",
+ "unsupported/Eigen/CXX11/*",
+ "Eigen/*",
]),
includes = [ "." ],
visibility = ["//visibility:public"],