diff options
Diffstat (limited to 'tensorflow/core/lib/random/philox_random.h')
-rw-r--r-- | tensorflow/core/lib/random/philox_random.h | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/tensorflow/core/lib/random/philox_random.h b/tensorflow/core/lib/random/philox_random.h new file mode 100644 index 0000000000..2c3cd0c4b9 --- /dev/null +++ b/tensorflow/core/lib/random/philox_random.h @@ -0,0 +1,232 @@ +// Implement the Philox algorithm to generate random numbers in parallel. +// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf + +#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ +#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ + +#include <stdlib.h> + +#include "tensorflow/core/platform/port.h" + +// Function qualifiers that need to work on both CPU and GPU. +#ifdef __CUDA_ARCH__ +// For nvcc. +#define PHILOX_DEVICE_FUNC __host__ __device__ +#define PHILOX_INLINE __inline__ +#else +// For non-nvcc. +#define PHILOX_DEVICE_FUNC +#define PHILOX_INLINE inline +#endif +#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE + +#include <math.h> + +namespace tensorflow { +namespace random { + +// A class that represents an inline array. It can be used on both CPU and GPU, +// and also trivially copyable between CPU and GPU. +// Arguments: +// T: the array element type; +// ElementCount: the fixed size of the array; +template <typename T, int ElementCount> +class Array { + public: + PHILOX_DEVICE_INLINE Array() { + for (int i = 0; i < ElementCount; ++i) { + data_[i] = T(); + } + } + + PHILOX_DEVICE_INLINE const T& operator[](int index) const { + return data_[index]; + } + + PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; } + + size_t size() const { return ElementCount; } + + private: + T data_[ElementCount]; +}; + +// A class that encapsulates all the states for a random number generator using +// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits +// in the form of four uint32. +// There are multiple variants of this algorithm, we picked the 4x32_10 version +// that is most suited for our applications. +// Since this class is meant to be copied between CPU to GPU, it maintains a +// value semantics. +// +// For example: To use this class and populate an array of 1024 randoms on CPU +// with two threads, +// +// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { +// assert(start % 4 == 0); +// assert(limit % 4 == 0); +// rnd.Skip(start / 4); +// for (int i = start; i < limit; i += 4) { +// auto sample = rnd(); +// ... copy sample[0..3] to output[i..i+3] +// } +// } +// +// PhiloxRandom rng(seed); +// PhiloxRandom rng_copy = rng; +// rng.Skip(1000/4); +// +// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; +// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; +// ... wait for thread 1 & 2 to finish executing Fill(). +// +// NOTE: +// 1. PhiloxRandom is trivially copyable. +// 2. PhiloxRandom is compilable by gcc and nvcc. +class PhiloxRandom { + public: + typedef Array<uint32, 4> ResultType; + typedef uint32 ResultElementType; + // The number of elements that will be returned. + static const int kResultElementCount = 4; + + PHILOX_DEVICE_INLINE + PhiloxRandom() {} + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64 seed) { + key_[0] = static_cast<uint32>(seed); + key_[1] = static_cast<uint32>(seed >> 32); + } + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) { + key_[0] = static_cast<uint32>(seed_lo); + key_[1] = static_cast<uint32>(seed_lo >> 32); + counter_[2] = static_cast<uint32>(seed_hi); + counter_[3] = static_cast<uint32>(seed_hi >> 32); + } + + // Skip the specified number of samples of 128-bits in the current stream. + PHILOX_DEVICE_INLINE + void Skip(uint64 count) { + const uint32 count_lo = static_cast<uint32>(count); + uint32 count_hi = static_cast<uint32>(count >> 32); + + counter_[0] += count_lo; + if (counter_[0] < count_lo) { + ++count_hi; + } + + counter_[1] += count_hi; + if (counter_[1] < count_hi) { + if (++counter_[2] == 0) { + ++counter_[3]; + } + } + } + + // Returns a group of four random numbers using the underlying Philox + // algorithm. + PHILOX_DEVICE_INLINE ResultType operator()() { + ResultType counter = counter_; + Key key = key_; + + // Run the single rounds for ten times. Manually unrolling the loop + // for better performance. + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + + SkipOne(); + + return counter; + } + + private: + // The type for the 64-bit key stored in the form of two 32-bit uint + // that are used in the diffusion process. + typedef Array<uint32, 2> Key; + + // We use the same constants as recommended by the original paper. + static const uint32 kPhiloxW32A = 0x9E3779B9; + static const uint32 kPhiloxW32B = 0xBB67AE85; + static const uint32 kPhiloxM4x32A = 0xD2511F53; + static const uint32 kPhiloxM4x32B = 0xCD9E8D57; + + // Helper function to skip the next sample of 128-bits in the current stream. + PHILOX_DEVICE_INLINE void SkipOne() { + if (++counter_[0] == 0) { + if (++counter_[1] == 0) { + if (++counter_[2] == 0) { + ++counter_[3]; + } + } + } + } + + // Helper function to return the lower and higher 32-bits from two 32-bit + // integer multiplications. + PHILOX_DEVICE_INLINE + static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low, + uint32* result_high) { +#ifndef __GCUDACC__ + const uint64 product = static_cast<uint64>(a) * b; + *result_low = static_cast<uint32>(product); + *result_high = static_cast<uint32>(product >> 32); +#else + *result_low = a * b; + *result_high = __umulhi(a, b); +#endif + } + + // Helper function for a single round of the underlying Philox algorithm. + PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound( + const ResultType& counter, const Key& key) { + uint32 lo0; + uint32 hi0; + MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); + + uint32 lo1; + uint32 hi1; + MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); + + ResultType result; + result[0] = hi1 ^ counter[1] ^ key[0]; + result[1] = lo1; + result[2] = hi0 ^ counter[3] ^ key[1]; + result[3] = lo0; + return result; + } + + PHILOX_DEVICE_INLINE void RaiseKey(Key* key) { + (*key)[0] += kPhiloxW32A; + (*key)[1] += kPhiloxW32B; + } + + private: + ResultType counter_; + Key key_; +}; + +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ |