Diffstat (limited to 'tensorflow/core/kernels/random_op.cc')
-rw-r--r-- | tensorflow/core/kernels/random_op.cc | 276
1 file changed, 276 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
new file mode 100644
index 0000000000..09b66d30e6
--- /dev/null
+++ b/tensorflow/core/kernels/random_op.cc
@@ -0,0 +1,276 @@
+// See docs in ../ops/random_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/random_op.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// The default implementation of the functor, which should never be invoked.
+// We still need to provide an implementation for now so the linker works,
+// since we do not support all the distributions yet.
+template <typename Device, class Distribution>
+struct FillPhiloxRandom {
+  typedef typename Distribution::ResultElementType T;
+  void operator()(OpKernelContext*, const Device&, random::PhiloxRandom gen,
+                  T* data, int64 size) {
+    LOG(FATAL) << "Default FillPhiloxRandom should not be executed.";
+  }
+};
+
+#if GOOGLE_CUDA
+// Declaration for the partial specialization with GPU
+template <class Distribution>
+struct FillPhiloxRandom<GPUDevice, Distribution> {
+  typedef typename Distribution::ResultElementType T;
+  void operator()(OpKernelContext* ctx, const GPUDevice&,
+                  random::PhiloxRandom gen, T* data, int64 size);
+};
+
+#endif
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomTask;
+
+// Specialization for distributions that take a fixed number of samples for
+// each output.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, false> {
+  typedef typename Distribution::ResultElementType T;
+  static void Run(random::PhiloxRandom gen, T* data, int64 size,
+                  int64 start_group, int64 limit_group) {
+    Distribution dist;
+    const int kGroupSize = Distribution::kResultElementCount;
+
+    gen.Skip(start_group);
+    int64 offset = start_group * kGroupSize;
+
+    // First fill all the full-size groups
+    int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+    for (int64 index = start_group; index < limit_group_full; ++index) {
+      auto samples = dist(&gen);
+      std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+      offset += kGroupSize;
+    }
+
+    // If there are any remaining elements that need to be filled, process them
+    if (limit_group_full < limit_group) {
+      int remaining_size = size - limit_group_full * kGroupSize;
+      auto samples = dist(&gen);
+      std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+    }
+  }
+};
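
The fixed-sample path above depends on one property of the Philox generator: Skip(n) advances the stream by n result groups in constant time, so every shard can position a private copy of the generator at its starting group, and the filled buffer comes out identical no matter how the groups are split across threads. Below is a minimal, self-contained sketch of that pattern; ToyCounterRng and FillRange are hypothetical stand-ins for random::PhiloxRandom and the Run body above, not TensorFlow code.

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a counter-based generator like Philox.
struct ToyCounterRng {
  static constexpr int kResultElementCount = 4;
  uint64_t counter = 0;

  // Jump ahead by `groups` output groups in O(1); this is what lets a shard
  // start in the middle of the stream.
  void Skip(uint64_t groups) { counter += groups; }

  // One group of outputs derived purely from the counter (a stand-in for the
  // Philox round function).
  std::array<uint64_t, kResultElementCount> operator()() {
    std::array<uint64_t, kResultElementCount> out;
    for (int i = 0; i < kResultElementCount; ++i) {
      uint64_t x = counter * kResultElementCount + i;
      x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33;  // bit mix
      out[i] = x;
    }
    ++counter;
    return out;
  }
};

// Fill data[start_group * k, limit_group * k), mirroring the Run body above.
void FillRange(ToyCounterRng gen, uint64_t* data, int64_t size,
               int64_t start_group, int64_t limit_group) {
  const int k = ToyCounterRng::kResultElementCount;
  gen.Skip(start_group);
  int64_t offset = start_group * k;
  int64_t limit_full = std::min(limit_group, size / k);
  for (int64_t g = start_group; g < limit_full; ++g) {
    auto s = gen();
    std::copy(s.begin(), s.end(), data + offset);
    offset += k;
  }
  if (limit_full < limit_group) {  // partial last group
    auto s = gen();
    std::copy(s.begin(), s.begin() + (size - limit_full * k), data + offset);
  }
}

int main() {
  const int64_t size = 10;  // two full groups of 4, one partial group of 2
  std::vector<uint64_t> serial(size), sharded(size);
  FillRange(ToyCounterRng{}, serial.data(), size, 0, 3);   // one shard
  FillRange(ToyCounterRng{}, sharded.data(), size, 0, 2);  // shard 1
  FillRange(ToyCounterRng{}, sharded.data(), size, 2, 3);  // shard 2
  assert(serial == sharded);  // same stream regardless of sharding
}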
+// Specialization for distributions that take a variable number of samples
+// for each output. This will be slower due to the generality.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, true> {
+  typedef typename Distribution::ResultElementType T;
+  static const int64 kReservedSamplesPerOutput = 256;
+
+  static void Run(random::PhiloxRandom base_gen, T* data, int64 size,
+                  int64 start_group, int64 limit_group) {
+    using random::PhiloxRandom;
+    using random::SingleSampleAdapter;
+
+    Distribution dist;
+    const int kGroupSize = Distribution::kResultElementCount;
+
+    static const int kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput /
+        PhiloxRandom::kResultElementCount;
+
+    int64 offset = start_group * kGroupSize;
+
+    // First fill all the full-size groups
+    int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+    int64 group_index;
+    for (group_index = start_group; group_index < limit_group_full;
+         ++group_index) {
+      // Reset the generator to the beginning of the output group region.
+      // This is necessary if we want the results to be independent of the
+      // order of the work.
+      PhiloxRandom gen = base_gen;
+      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+      auto samples = dist(&single_samples);
+      std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+      offset += kGroupSize;
+    }
+
+    // If there are any remaining elements that need to be filled, process them
+    if (limit_group_full < limit_group) {
+      PhiloxRandom gen = base_gen;
+      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+      int remaining_size = size - limit_group_full * kGroupSize;
+      auto samples = dist(&single_samples);
+      std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+    }
+  }
+};
+
+// Partial specialization for CPU to fill the entire region with random values.
+// It splits the work into several tasks and runs them in parallel.
+template <class Distribution>
+struct FillPhiloxRandom<CPUDevice, Distribution> {
+  typedef typename Distribution::ResultElementType T;
+  void operator()(OpKernelContext* context, const CPUDevice&,
+                  random::PhiloxRandom gen, T* data, int64 size) {
+    const int kGroupSize = Distribution::kResultElementCount;
+
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+
+    int64 total_group_count = (size + kGroupSize - 1) / kGroupSize;
+
+    // Limit to a maximum of six threads for now. The performance scaling is
+    // very sub-linear, and using too many threads makes the overall
+    // performance much worse.
+    int num_workers = 6;
+    Shard(num_workers, worker_threads.workers, total_group_count, kGroupSize,
+          [&gen, data, size](int64 start_group, int64 limit_group) {
+            FillPhiloxRandomTask<
+                Distribution,
+                Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+                                                              start_group,
+                                                              limit_group);
+          });
+  }
+};
+}  // namespace functor
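
For distributions that consume a variable number of samples per output (the truncated normal rejection sampler is the case this kernel targets), order independence comes from budgeting kReservedSamplesPerOutput = 256 single samples per output and positioning each output group at group_index * kGeneratorSkipPerOutputGroup in the stream. The sketch below just demonstrates that arithmetic with representative constants; it is an illustration, not TensorFlow code, and the constants are assumptions.

#include <cstdint>
#include <iostream>

int main() {
  const int kPhiloxResultElementCount = 4;  // outputs per Philox invocation
  const int kGroupSize = 4;                 // results per distribution call
  const int64_t kReservedSamplesPerOutput = 256;

  // Philox invocations reserved per output group, as computed in the kernel.
  const int64_t kGeneratorSkipPerOutputGroup =
      kGroupSize * kReservedSamplesPerOutput / kPhiloxResultElementCount;

  // Each group owns a contiguous, non-overlapping slice of the stream, so
  // group i and group i+1 can never collide even when a rejection sampler
  // draws a different number of samples for each output.
  for (int64_t group = 0; group < 3; ++group) {
    std::cout << "group " << group << " owns invocations ["
              << group * kGeneratorSkipPerOutputGroup << ", "
              << (group + 1) * kGeneratorSkipPerOutputGroup << ")\n";
  }
}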
+
+// For now, use the same interface as RandomOp, so we can choose either one
+// at run time.
+template <typename Device, class Distribution>
+class PhiloxRandomOp : public OpKernel {
+ public:
+  typedef typename Distribution::ResultElementType T;
+  explicit PhiloxRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsLegacyVector(input.shape()),
+        errors::InvalidArgument("shape must be a vector of {int32,int64}."));
+    Tensor* output = nullptr;
+    if (input.dtype() == DataType::DT_INT32) {
+      auto vec = input.flat<int32>();
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+                                                      vec.data(), vec.size()),
+                                               &output));
+    } else if (input.dtype() == DataType::DT_INT64) {
+      auto vec = input.flat<int64>();
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+                                                      vec.data(), vec.size()),
+                                               &output));
+    } else {
+      OP_REQUIRES(ctx, false, errors::InvalidArgument(
+                                  "shape must be a vector of {int32,int64}."));
+    }
+    functor::FillPhiloxRandom<Device, Distribution>()(
+        ctx, ctx->eigen_device<Device>(),
+        ReserveRandomOutputs(output->flat<T>().size()),
+        output->flat<T>().data(), output->flat<T>().size());
+  }
+
+ private:
+  GuardedPhiloxRandom generator_;
+
+  // Reserve enough random samples in the generator for the given output count.
+  random::PhiloxRandom ReserveRandomOutputs(int64 output_count) {
+    int64 conservative_sample_count = output_count << 8;
+    return generator_.ReserveSamples128(conservative_sample_count);
+  }
+};
+
+#define REGISTER(TYPE)                                               \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("RandomUniform")                                          \
+          .Device(DEVICE_CPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<CPUDevice, random::UniformDistribution<         \
+                                    random::PhiloxRandom, TYPE> >);  \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("RandomStandardNormal")                                   \
+          .Device(DEVICE_CPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<CPUDevice, random::NormalDistribution<          \
+                                    random::PhiloxRandom, TYPE> >);  \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("TruncatedNormal")                                        \
+          .Device(DEVICE_CPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<                                                \
+          CPUDevice,                                                 \
+          random::TruncatedNormalDistribution<                       \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#if GOOGLE_CUDA
+
+#define REGISTER(TYPE)                                               \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("RandomUniform")                                          \
+          .Device(DEVICE_GPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<int32>("T")                                \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<GPUDevice, random::UniformDistribution<         \
+                                    random::PhiloxRandom, TYPE> >);  \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("RandomStandardNormal")                                   \
+          .Device(DEVICE_GPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<int32>("T")                                \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<GPUDevice, random::NormalDistribution<          \
+                                    random::PhiloxRandom, TYPE> >);  \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("TruncatedNormal")                                        \
+          .Device(DEVICE_GPU)                                        \
+          .HostMemory("shape")                                       \
+          .TypeConstraint<int32>("T")                                \
+          .TypeConstraint<TYPE>("dtype"),                            \
+      PhiloxRandomOp<                                                \
+          GPUDevice,                                                 \
+          random::TruncatedNormalDistribution<                       \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
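
ReserveRandomOutputs budgets output_count << 8, i.e. 256 samples per output element, matching kReservedSamplesPerOutput in the variable-sample task above, and delegates thread safety to GuardedPhiloxRandom. For that call to be safe, a guarded generator presumably hands each caller a private copy and skips the shared generator past the reserved region under a lock. The following is a simplified, hypothetical re-implementation of that idea; ToyPhilox and GuardedToyPhilox are stand-ins, not the real TensorFlow classes.

#include <cstdint>
#include <mutex>

// Hypothetical stand-in for random::PhiloxRandom.
struct ToyPhilox {
  static constexpr int kResultElementCount = 4;  // 4 x 32-bit = 128 bits
  uint64_t counter = 0;
  void Skip(uint64_t n) { counter += n; }  // O(1) jump-ahead
};

class GuardedToyPhilox {
 public:
  // Returns a generator good for `samples` 128-bit sample groups. Concurrent
  // callers receive non-overlapping regions of the same logical stream.
  ToyPhilox ReserveSamples128(uint64_t samples) {
    std::lock_guard<std::mutex> lock(mu_);
    ToyPhilox local = generator_;  // private copy for the caller
    generator_.Skip(samples);      // shared state moves past the reservation
    return local;
  }

 private:
  std::mutex mu_;
  ToyPhilox generator_;
};

int main() {
  GuardedToyPhilox guarded;
  // Mirrors ReserveRandomOutputs: budget 256 samples per output element.
  const uint64_t output_count = 1000;
  ToyPhilox a = guarded.ReserveSamples128(output_count << 8);  // i.e. * 256
  ToyPhilox b = guarded.ReserveSamples128(output_count << 8);
  // a.counter == 0 and b.counter == 256000: disjoint stream regions.
  return (b.counter - a.counter == output_count * 256) ? 0 : 1;
}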