#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/random_op.h"

#include <assert.h>
#include <stdio.h>

#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/lib/random/random_distributions.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

namespace tensorflow {

class OpKernelContext;

namespace functor {

typedef Eigen::GpuDevice GPUDevice;

template <class Distribution, bool VariableSamplesPerOutput>
struct FillPhiloxRandomKernel;

// A CUDA kernel to fill the data with random numbers from the specified
// distribution. Each output takes a fixed number of samples.
template <class Distribution>
struct FillPhiloxRandomKernel<Distribution, false> {
  typedef typename Distribution::ResultElementType T;
  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size) {
    Distribution dist;
    const int kGroupSize = Distribution::kResultElementCount;

    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int32 total_thread_count = gridDim.x * blockDim.x;
    int32 offset = thread_id * kGroupSize;
    gen.Skip(thread_id);

    while (offset < size) {
      typename Distribution::ResultType samples = dist(&gen);

      for (int i = 0; i < kGroupSize; ++i) {
        if (offset >= size) {
          return;
        }
        data[offset] = samples[i];
        ++offset;
      }

      offset += (total_thread_count - 1) * kGroupSize;
      gen.Skip(total_thread_count - 1);
    }
  }
};

// A CUDA kernel to fill the data with random numbers from the specified
// distribution. Each output takes a variable number of samples.
template <class Distribution>
struct FillPhiloxRandomKernel<Distribution, true> {
  typedef typename Distribution::ResultElementType T;
  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
                              int64 size) {
    using random::PhiloxRandom;
    using random::SingleSampleAdapter;

    const int kReservedSamplesPerOutput = 256;
    const int kGroupSize = Distribution::kResultElementCount;
    const int kGeneratorSkipPerOutputGroup =
        kGroupSize * kReservedSamplesPerOutput /
        PhiloxRandom::kResultElementCount;

    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int32 total_thread_count = gridDim.x * blockDim.x;
    int64 group_index = thread_id;
    int64 offset = group_index * kGroupSize;
    Distribution dist;

    while (offset < size) {
      // Since each output takes a variable number of samples, we need to
      // realign the generator to the beginning for the current output group.
      PhiloxRandom gen = base_gen;
      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);

      typename Distribution::ResultType samples = dist(&single_samples);

      for (int i = 0; i < kGroupSize; ++i) {
        if (offset >= size) {
          return;
        }
        data[offset] = samples[i];
        ++offset;
      }

      offset += (total_thread_count - 1) * kGroupSize;
      group_index += total_thread_count;
    }
  }
};

// A simple launch pad to call the correct function templates to fill the data
template <class Distribution>
__global__ void __launch_bounds__(1024)
    FillPhiloxRandomKernelLaunch(random::PhiloxRandom base_gen,
                                 typename Distribution::ResultElementType* data,
                                 int64 size) {
  FillPhiloxRandomKernel<Distribution,
                         Distribution::kVariableSamplesPerOutput>()
      .Run(base_gen, data, size);
}

// Partial specialization for GPU
template <class Distribution>
struct FillPhiloxRandom<GPUDevice, Distribution> {
  typedef typename Distribution::ResultElementType T;
  typedef GPUDevice Device;
  void operator()(OpKernelContext*, const Device& d, random::PhiloxRandom gen,
                  T* data, int64 size) {
    const int32 block_size = d.maxCudaThreadsPerBlock();
    const int32 num_blocks =
        (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
        block_size;

    FillPhiloxRandomKernelLaunch<Distribution>
        <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size);
  }
};
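// Illustration only, not part of the upstream file: a minimal sketch of how a
// host-side op might invoke the functor above. The names `ctx`, `device`,
// `d_output`, `num_elements`, and the seed values are hypothetical
// placeholders supplied by the surrounding kernel:
//
//   typedef random::NormalDistribution<random::PhiloxRandom, float> Dist;
//   random::PhiloxRandom gen(seed_lo, seed_hi);
//   functor::FillPhiloxRandom<GPUDevice, Dist>()(ctx, device, gen,
//                                                d_output, num_elements);
//
// PhiloxRandom is counter-based, so Skip(n) jumps ahead n result groups in
// O(1); the kernels above rely on this to hand each CUDA thread a disjoint,
// reproducible slice of the random stream without any synchronization.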
// Explicit instantiation of the GPU distributions functors
// clang-format off
// NVCC cannot handle ">>" properly
template struct FillPhiloxRandom<
    GPUDevice, random::UniformDistribution<random::PhiloxRandom, float> >;
template struct FillPhiloxRandom<
    GPUDevice, random::UniformDistribution<random::PhiloxRandom, double> >;
template struct FillPhiloxRandom<
    GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
template struct FillPhiloxRandom<
    GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
template struct FillPhiloxRandom<
    GPUDevice, random::TruncatedNormalDistribution<
                   random::SingleSampleAdapter<random::PhiloxRandom>, float> >;
template struct FillPhiloxRandom<
    GPUDevice, random::TruncatedNormalDistribution<
                   random::SingleSampleAdapter<random::PhiloxRandom>, double> >;
// clang-format on

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA