author    Benoit Steiner <benoit.steiner.goog@gmail.com>  2016-10-05 14:54:36 -0700
committer Benoit Steiner <benoit.steiner.goog@gmail.com>  2016-10-05 14:54:36 -0700
commit    ae1385c7e46fd35f4e1a89fd0fda5ec828a85c41 (patch)
tree      484427e28e9f8a58f1fa408bf6472af5543d8db5 /unsupported/Eigen/CXX11
parent    73b00129451f53a3a701397617c765ec2eb87851 (diff)
parent    ceee1c008b6d618a48846283e1f18ba1b4cc171a (diff)
Pull the latest updates from trunk
Diffstat (limited to 'unsupported/Eigen/CXX11')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                            6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h      3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h   121
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h       7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h     458
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h       276
6 files changed, 338 insertions, 533 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index da6a3f301..6743179d3 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -61,8 +61,9 @@ typedef unsigned __int64 uint64_t;
#ifdef EIGEN_USE_GPU
#include <iostream>
#include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
#endif
#endif
@@ -81,6 +82,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index d66e45d50..83c449cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -51,12 +51,15 @@ class TensorOpCost {
internal::scalar_cast_op<SrcType, TargetType> >::Cost;
}
+ EIGEN_DEVICE_FUNC
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(compute_cycles) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
bool vectorized, double packet_size)
: bytes_loaded_(bytes_loaded),
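A minimal sketch of what the added EIGEN_DEVICE_FUNC annotations buy, assuming the bytes_loaded() accessor declared elsewhere in TensorCostModel.h; loaded_bytes_estimate is a hypothetical helper, not part of the patch:

```cpp
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// With the constructors marked EIGEN_DEVICE_FUNC, a TensorOpCost can now
// be built inside device code, not just on the host.
__device__ double loaded_bytes_estimate() {
  Eigen::TensorOpCost cost(4 * sizeof(float),  // bytes loaded
                           sizeof(float),      // bytes stored
                           8.0);               // compute cycles
  return cost.bytes_loaded();
}
```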
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1468caa23..4f5767bc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
- if (!m_devicePropInitialized) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+ // implement if we could use std::mutex, but unfortunately std::mutex
+ // doesn't compile with nvcc, so we resort to atomics and thread fences
+ // instead. Note that if the caller uses a compiler that doesn't support
+ // C++11 we can't ensure that the initialization is thread-safe.
+#if __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
assert(status == cudaSuccess);
}
}
+
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
m_devicePropInitialized = true;
+ } else {
+ // Wait for the other thread to initialize the properties.
+ while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ sleep(1);
+ }
}
}
}
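For readers unfamiliar with the pattern, here is a standalone sketch of the same double-checked initialization, assuming a C++11 compiler; the names are illustrative. Unlike the code above, the flag itself is atomic here, which makes the spin-wait race-free without the pre-C++11 fallback:

```cpp
#include <atomic>
#include <unistd.h>  // sleep()

static std::atomic<bool> g_initialized(false);

static void initialize_once() {
  static std::atomic<bool> first(true);
  if (first.exchange(false)) {
    // First thread in: perform the one-time setup (e.g. query the
    // properties of every CUDA device), then publish the result.
    g_initialized.store(true, std::memory_order_release);
  } else {
    // Late arrivals poll until the setup becomes visible.
    while (!g_initialized.load(std::memory_order_acquire)) {
      sleep(1);
    }
  }
}
```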
@@ -168,39 +194,20 @@ struct GpuDevice {
return stream_->stream();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return NULL;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void* scratchpad() const {
return stream_->scratchpad();
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return NULL;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
return stream_->semaphore();
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return NULL;
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -210,30 +217,22 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -242,21 +241,21 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices.
return firstLevelCacheSize();
@@ -276,56 +275,26 @@ struct GpuDevice {
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
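Since the accessors above are now host-only, a typical usage sketch looks as follows (assuming the CudaStreamDevice helper defined earlier in this header; the function name is illustrative):

```cpp
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// Host-side only: allocate() and friends no longer compile in kernels.
void scratch_buffer_demo() {
  Eigen::CudaStreamDevice stream;    // wraps the default CUDA stream
  Eigen::GpuDevice device(&stream);

  float* buf = static_cast<float*>(device.allocate(1024 * sizeof(float)));
  device.memset(buf, 0, 1024 * sizeof(float));
  // ... evaluate tensor expressions against `device` here ...
  device.deallocate(buf);
}
```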
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 9b99af641..f01d77c0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
-
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
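The dropped workaround existed because CUDA passes kernel arguments as a byte-wise copy rather than through a copy constructor; the simplification therefore assumes the evaluator is trivially copyable. A minimal illustration of that calling convention, with a hypothetical Scale functor standing in for the evaluator:

```cpp
// CUDA copies `op` byte-for-byte to the device, which is well-defined
// only for trivially copyable argument types such as this one.
struct Scale {
  float factor;
  __device__ float operator()(float x) const { return factor * x; }
};

__global__ void apply(Scale op, float* data, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = op(data[i]);
}

// Launch sketch: apply<<<num_blocks, 256>>>(Scale{2.0f}, d_data, n);
```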
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index fc75dbb5c..7164e8d60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
MeanReducer() : scalarCount_(0), packetCount_(0) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
scalarCount_++;
}
template <typename Packet>
@@ -190,25 +192,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
template <typename T, bool IsMax = true, bool IsInteger = true>
struct MinMaxBottomValue {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::lowest();
}
};
template <typename T>
struct MinMaxBottomValue<T, true, false> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return -Eigen::NumTraits<T>::infinity();
}
};
template <typename T>
struct MinMaxBottomValue<T, false, true> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::highest();
}
};
template <typename T>
struct MinMaxBottomValue<T, false, false> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::infinity();
}
};
@@ -439,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
};
-// Random number generation
-namespace {
-#ifdef __CUDA_ARCH__
-__device__ int get_random_seed() {
- return clock();
-}
-#else
-static inline int get_random_seed() {
-#ifdef _WIN32
- SYSTEMTIME st;
- GetSystemTime(&st);
- return st.wSecond + 1000 * st.wMilliseconds;
-#elif defined __APPLE__
- return static_cast<int>(mach_absolute_time());
-#else
- timespec ts;
- clock_gettime(CLOCK_REALTIME, &ts);
- return static_cast<int>(ts.tv_nsec);
-#endif
-}
-#endif
-}
-
-#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
-// We're not compiling a cuda kernel
-template <typename T> class UniformRandomGenerator {
-
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- if (!deterministic) {
- srand(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- }
-
- T operator()() const {
- return random<T>();
- }
- template<typename PacketType>
- PacketType packetOp() const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = random<T>();
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- bool m_deterministic;
-};
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other() * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- float operator()() const {
- return m_distribution(*m_generator);
- }
- template<typename PacketType>
- PacketType packetOp() const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX float values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()();
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<float> m_distribution;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other() * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- double operator()() const {
- return m_distribution(*m_generator);
- }
- template<typename PacketType>
- PacketType packetOp() const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX double values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()();
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<double> m_distribution;
-};
-#endif
-
-#else
-
-// We're compiling a cuda kernel
-template <typename T> class UniformRandomGenerator;
-
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- __device__ float operator()() const {
- return curand_uniform(&m_state);
- }
- template<typename PacketType>
- __device__ float4 packetOp() const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ double operator()() const {
- return curand_uniform_double(&m_state);
- }
- template<typename PacketType>
- __device__ double2 packetOp() const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ std::complex<float> operator()() const {
- float4 vals = curand_uniform4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ std::complex<double> operator()() const {
- double2 vals = curand_uniform2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
- enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
- };
-};
-
-
-
-#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
-// We're not compiling a cuda kernel
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = true;
-
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- NormalRandomGenerator(const NormalRandomGenerator& other)
- : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
- m_generator->seed(other() * UINT_MAX);
- }
- ~NormalRandomGenerator() {
- delete m_generator;
- }
- T operator()() const {
- return m_distribution(*m_generator);
- }
- template<typename PacketType>
- PacketType packetOp() const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = m_distribution(*m_generator);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- // No assignment
- NormalRandomGenerator& operator = (const NormalRandomGenerator&);
-
- bool m_deterministic;
- mutable std::normal_distribution<T> m_distribution;
- std::mt19937* m_generator;
-};
-
-#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
-
-// We're compiling a cuda kernel
-template <typename T> class NormalRandomGenerator;
-
-template <> class NormalRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ float operator()() const {
- return curand_normal(&m_state);
- }
- template<typename PacketType>
- __device__ float4 packetOp() const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ double operator()() const {
- return curand_normal_double(&m_state);
- }
- template<typename PacketType>
- __device__ double2 packetOp() const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ std::complex<float> operator()() const {
- float4 vals = curand_normal4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ std::complex<double> operator()() const {
- double2 vals = curand_normal2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#else
-
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = false;
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
-
- private:
- bool m_deterministic;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
- enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
- };
-};
-
-
template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
public:
@@ -895,7 +455,7 @@ class GaussianGenerator {
}
}
- T operator()(const array<Index, NumDims>& coordinates) const {
+ EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];
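The SumReducer/MeanReducer hunks above replace the bare += with internal::scalar_sum_op so the addition can be specialized per scalar type (for example half precision). A hedged sketch of the same pattern outside the reducer classes; reduce_all is a hypothetical helper:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

// Accumulating through the functor instead of operator+= lets Eigen pick
// a type-specific implementation of the sum when one exists.
template <typename T>
T reduce_all(const T* values, int n) {
  Eigen::internal::scalar_sum_op<T> sum_op;
  T accum = T(0);
  for (int i = 0; i < n; ++i) {
    accum = sum_op(accum, values[i]);  // mirrors SumReducer::reduce
  }
  return accum;
}
```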
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..dd369fb35
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+ // We don't support 3D kernels since we currently only use 1D and
+ // 2D kernels.
+ assert(threadIdx.z == 0);
+ return clock64() +
+ blockIdx.x * blockDim.x + threadIdx.x +
+ gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+ // Use the current time as a baseline.
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ int time = st.wSecond + 1000 * st.wMilliseconds;
+ // Mix in a random number to make sure that we get different seeds if
+ // we try to generate seeds faster than the clock resolution.
+ // We need 2 random values since the generator only generates 16 bits
+ // at a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx).
+ unsigned rnd1 = ::rand();
+ unsigned rnd2 = ::rand();
+ uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+ return rnd;
+
+#elif defined __APPLE__
+ // Same approach as for Win32, except that the random number generator
+ // is better (see https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+ uint64_t rnd = ::random() ^ mach_absolute_time();
+ return rnd;
+
+#else
+ // Augment the current time with pseudo random number generation
+ // to ensure that we get different seeds if we try to generate seeds
+ // faster than the clock resolution.
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ uint64_t rnd = ::random() ^ ts.tv_nsec;
+ return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+ // TODO: Unify with the implementation in the non-blocking thread pool.
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+ seed = seed ? seed : get_random_seed();
+ return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+} // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ return static_cast<T>(rnd);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+ Eigen::half result;
+ // Generate 10 random bits for the mantissa
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+ // Set the exponent
+ result.x |= (static_cast<uint16_t>(15) << 10);
+ // Return the final result
+ return result - Eigen::half(1.0f);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+ typedef union {
+ uint32_t raw;
+ float fp;
+ } internal;
+ internal result;
+ // Generate 23 random bits for the mantissa
+ const unsigned rnd = PCG_XSH_RS_generator(state);
+ result.raw = rnd & 0x7fffffu;
+ // Set the exponent
+ result.raw |= (static_cast<uint32_t>(127) << 23);
+ // Return the final result
+ return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+ typedef union {
+ uint64_t raw;
+ double dp;
+ } internal;
+ internal result;
+ result.raw = 0;
+ // Generate 52 random bits for the mantissa
+ // First generate the upper 20 bits
+ unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+ // Then generate the lower 32 bits
+ unsigned rnd2 = PCG_XSH_RS_generator(state);
+ result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+ // Set the exponent
+ result.raw |= (static_cast<uint64_t>(1023) << 52);
+ // Return the final result
+ return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeUniform<float>(state),
+ RandomToTypeUniform<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeUniform<double>(state),
+ RandomToTypeUniform<double>(state));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ const UniformRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeUniform<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeUniform<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+ enum {
+ // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+ Cost = 12 * NumTraits<Scalar>::AddCost *
+ ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+ PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+ // Use the ratio-of-uniforms method to generate numbers following a
+ // normal distribution. See for example Numerical Recipes chapter 7.3.9
+ // for the details.
+ T u, v, q;
+ do {
+ u = RandomToTypeUniform<T>(state);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+ const T x = u - T(0.449871);
+ const T y = numext::abs(v) + T(0.386595);
+ q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+ } while (q > T(0.27597) &&
+ (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+ return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeNormal<float>(state),
+ RandomToTypeNormal<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeNormal<double>(state),
+ RandomToTypeNormal<double>(state));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+ const NormalRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeNormal<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeNormal<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+ enum {
+ // On average, we need to generate about 3 random numbers
+ // 15 mul, 8 add, 1.5 logs
+ Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+ 15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
+ 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+ PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
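To see the new generator end to end, here is a standalone re-creation of the PCG-XSH-RS step and the float bit trick from the file above, for illustration only; it seeds the state directly rather than going through PCG_XSH_RS_state / get_random_seed:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

static unsigned pcg_xsh_rs(uint64_t* state) {
  const uint64_t current = *state;
  // Same LCG constants as TensorRandom.h above.
  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

static float uniform_float(uint64_t* state) {
  // Fill the 23 mantissa bits and force the exponent to 127: the result
  // is uniform in [1, 2), so subtracting 1 maps it to [0, 1).
  const uint32_t raw = (pcg_xsh_rs(state) & 0x7fffffu) | (127u << 23);
  float fp;
  std::memcpy(&fp, &raw, sizeof(fp));  // avoids union type-punning
  return fp - 1.0f;
}

int main() {
  uint64_t state = 0x853c49e6748fea9bULL;  // arbitrary nonzero seed
  for (int i = 0; i < 4; ++i) {
    std::printf("%f\n", uniform_float(&state));
  }
  return 0;
}
```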