author    Luke Iwanski <luke@codeplay.com>  2016-09-19 14:03:54 +0100
committer Luke Iwanski <luke@codeplay.com>  2016-09-19 14:03:54 +0100
commit    b91e0211727b9ea5d7c30908ed86afc4e50d4c6c (patch)
tree      4dd857d1aabdd4065e8b7dec1369a4bd06bac5e7 /unsupported/Eigen/CXX11
parent    cb81975714a96ecb2faf33ca242feeee3543b1db (diff)
parent    ff47717f25aeede4878f65b214cdce264b8314e8 (diff)
Merged with default.
Diffstat (limited to 'unsupported/Eigen/CXX11')
-rw-r--r--  unsupported/Eigen/CXX11/CMakeLists.txt                          |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/CMakeLists.txt                      |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt               |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h                 |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h          |   9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h            |  24
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h            |   9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h             | 114
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h            |  23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h        |  79
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt       |   8
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt  |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt           |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h             |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h  |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CMakeLists.txt                 |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/util/MaxSizeVector.h                |  11
17 files changed, 176 insertions, 145 deletions
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
)
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Tensor_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e3880d2e0..3c8710255 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+ log1p() const {
+ return unaryExpr(internal::scalar_log1p_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
abs() const {
return unaryExpr(internal::scalar_abs_op<Scalar>());
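Usage sketch (illustrative, not part of this changeset): the hunk above adds a log1p() expression to TensorBase, so elementwise log(1 + x) can be written directly on a tensor expression. Values below are made up; the unsupported Tensor module is assumed to be available.

#include <unsupported/Eigen/CXX11/Tensor>

void log1p_example() {
  Eigen::Tensor<float, 2> x(2, 2);
  x.setValues({{0.0f, 1e-6f}, {0.5f, 1.0f}});
  Eigen::Tensor<float, 2> y = x.log1p();  // elementwise log(1 + x), more accurate than log(1 + x) near zero
}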
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+ typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
}
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
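Call-pattern sketch (illustrative, not part of this changeset): cwiseMin() and cwiseMax() are now const and return a fresh TensorOpCost instead of mutating *this, so per-coefficient cost estimates can be combined without modifying the operands. The numbers below are made up.

Eigen::TensorOpCost a(/*bytes_loaded=*/16, /*bytes_stored=*/8, /*compute_cycles=*/4);
Eigen::TensorOpCost b(/*bytes_loaded=*/32, /*bytes_stored=*/8, /*compute_cycles=*/2);
Eigen::TensorOpCost worst = a.cwiseMax(b);  // elementwise max: (32, 8, 4); a and b are left unchanged
Eigen::TensorOpCost best  = a.cwiseMin(b);  // elementwise min: (16, 8, 2)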
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b2b4bcf62..834ce07df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -239,7 +239,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+ : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
typedef typename XprType::Index Index;
@@ -256,13 +256,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
- return m_functor(index);
+ return m_wrapper(m_functor, index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- return m_functor.template packetOp<Index, PacketReturnType>(index);
+ return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -282,6 +282,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
+ const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
};
@@ -612,7 +613,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
/// required by sycl in order to extract the accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index a8e48fced..fc75dbb5c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
/** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
template <typename Scalar>
struct scalar_fmod_op {
@@ -188,6 +188,32 @@ struct reducer_traits<MeanReducer<T>, Device> {
};
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ return -Eigen::NumTraits<T>::infinity();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+ return Eigen::NumTraits<T>::infinity();
+ }
+};
+
+
template <typename T> struct MaxReducer
{
static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -200,9 +226,8 @@ template <typename T> struct MaxReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmax<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::lowest();
+ return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -242,9 +267,8 @@ template <typename T> struct MinReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmin<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::highest();
+ return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -454,12 +478,11 @@ template <typename T> class UniformRandomGenerator {
m_deterministic = other.m_deterministic;
}
- template<typename Index>
- T operator()(Index) const {
+ T operator()() const {
return random<T>();
}
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
+ template<typename PacketType>
+ PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
@@ -484,23 +507,22 @@ template <> class UniformRandomGenerator<float> {
}
UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
+ m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
- template<typename Index>
- float operator()(Index) const {
+ float operator()() const {
return m_distribution(*m_generator);
}
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
+ template<typename PacketType>
+ PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX float values[packetSize];
for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
+ values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
@@ -525,23 +547,22 @@ template <> class UniformRandomGenerator<double> {
}
UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
+ m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
- template<typename Index>
- double operator()(Index) const {
+ double operator()() const {
return m_distribution(*m_generator);
}
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
+ template<typename PacketType>
+ PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX double values[packetSize];
for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
+ values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
@@ -578,12 +599,11 @@ template <> class UniformRandomGenerator<float> {
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ float operator()(Index) const {
+ __device__ float operator()() const {
return curand_uniform(&m_state);
}
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
+ template<typename PacketType>
+ __device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform4(&m_state);
}
@@ -608,12 +628,11 @@ template <> class UniformRandomGenerator<double> {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ double operator()(Index) const {
+ __device__ double operator()() const {
return curand_uniform_double(&m_state);
}
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
+ template<typename PacketType>
+ __device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform2_double(&m_state);
}
@@ -638,8 +657,7 @@ template <> class UniformRandomGenerator<std::complex<float> > {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
+ __device__ std::complex<float> operator()() const {
float4 vals = curand_uniform4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
@@ -664,8 +682,7 @@ template <> class UniformRandomGenerator<std::complex<double> > {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
+ __device__ std::complex<double> operator()() const {
double2 vals = curand_uniform2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
@@ -701,17 +718,16 @@ template <typename T> class NormalRandomGenerator {
}
NormalRandomGenerator(const NormalRandomGenerator& other)
: m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
- m_generator->seed(other(0) * UINT_MAX);
+ m_generator->seed(other() * UINT_MAX);
}
~NormalRandomGenerator() {
delete m_generator;
}
- template<typename Index>
- T operator()(Index) const {
+ T operator()() const {
return m_distribution(*m_generator);
}
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
+ template<typename PacketType>
+ PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
@@ -749,12 +765,11 @@ template <> class NormalRandomGenerator<float> {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ float operator()(Index) const {
+ __device__ float operator()() const {
return curand_normal(&m_state);
}
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
+ template<typename PacketType>
+ __device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal4(&m_state);
}
@@ -779,12 +794,11 @@ template <> class NormalRandomGenerator<double> {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ double operator()(Index) const {
+ __device__ double operator()() const {
return curand_normal_double(&m_state);
}
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
+ template<typename PacketType>
+ __device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal2_double(&m_state);
}
@@ -809,8 +823,7 @@ template <> class NormalRandomGenerator<std::complex<float> > {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
+ __device__ std::complex<float> operator()() const {
float4 vals = curand_normal4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
@@ -835,8 +848,7 @@ template <> class NormalRandomGenerator<std::complex<double> > {
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
+ __device__ std::complex<double> operator()() const {
double2 vals = curand_normal2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
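Behavior sketch (illustrative, not part of this changeset): the MinMaxBottomValue trait added above selects the reduction identity per scalar type, so MaxReducer/MinReducer now start from -infinity/+infinity for floating-point scalars and from lowest()/highest() for integer scalars. Namespacing follows the definitions above (Eigen::internal).

float fmax_init = Eigen::internal::MinMaxBottomValue<float, /*IsMax=*/true,  /*IsInteger=*/false>::bottom_value();  // -infinity
float fmin_init = Eigen::internal::MinMaxBottomValue<float, /*IsMax=*/false, /*IsInteger=*/false>::bottom_value();  // +infinity
int   imax_init = Eigen::internal::MinMaxBottomValue<int,   /*IsMax=*/true,  /*IsInteger=*/true >::bottom_value();  // NumTraits<int>::lowest()
int   imin_init = Eigen::internal::MinMaxBottomValue<int,   /*IsMax=*/false, /*IsInteger=*/true >::bottom_value();  // NumTraits<int>::highest()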
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9df697e4c..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -505,9 +505,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
(reducing_inner_dims || ReducingInnerMostDims)) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
- data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
- m_result = data;
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
}
Op reducer(m_reducer);
if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -533,9 +538,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
- data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
- m_result = data;
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
}
Op reducer(m_reducer);
if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -556,6 +566,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
m_impl.cleanup();
if (m_result) {
m_device.deallocate(m_result);
+ m_result = NULL;
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 5e512490c..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
@@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
return;
}
}
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
}
#endif
@@ -130,7 +137,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
unsigned int block = atomicCAS(semaphore, 0u, 1u);
if (block == 0) {
// We're the first block to run, initialize the output value
- atomicExch(output, reducer.initialize());
+ atomicExchCustom(output, reducer.initialize());
__threadfence();
atomicExch(semaphore, 2u);
}
@@ -263,17 +270,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
- assert(false && "Should only be called on floats and half floats");
+ assert(false && "Should only be called on doubles, floats and half floats");
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
@@ -330,20 +342,22 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats and half floats.
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
@@ -360,6 +374,7 @@ template <int NumPerThread, typename Self,
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
+ typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
@@ -389,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
- float reduced_val = reducer.initialize();
+ Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
- const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
@@ -521,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#endif
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats and half floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -549,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@@ -616,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
@@ -675,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
// so reduce the scope of the optimized version of the code to the simple case
// of floats.
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
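End-to-end sketch (illustrative, not part of this changeset): together with atomicExchCustom and the enable_if-based launcher specializations above, the CUDA full/inner/outer reduction paths now cover double in addition to float and half. The snippet below assumes a CUDA-enabled build; buffer sizes and names are made up for the example.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cuda_runtime.h>

void sum_double_on_gpu() {
  double *d_in = 0, *d_out = 0;
  cudaMalloc(reinterpret_cast<void**>(&d_in), 1024 * 1024 * sizeof(double));  // input buffer (placeholder size)
  cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(double));               // rank-0 result
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<double, 2> > in(d_in, 1024, 1024);
  Eigen::TensorMap<Eigen::Tensor<double, 0> > out(d_out);
  out.device(gpu_device) = in.sum();  // full reduction over doubles now takes the optimized FullReducer path
}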
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
- )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_ThreadPool_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 12b80d6c4..71d55552d 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -50,7 +50,7 @@ class EventCount {
public:
class Waiter;
- EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+ EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
// Initialize epoch to something close to overflow to test overflow.
state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
@@ -199,7 +199,7 @@ class EventCount {
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
- std::vector<Waiter>& waiters_;
+ MaxSizeVector<Waiter>& waiters_;
void Park(Waiter* w) {
std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 33ae45131..354bce52a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -29,6 +29,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
spinning_(0),
done_(false),
ec_(waiters_) {
+ waiters_.resize(num_threads);
+
// Calculate coprimes of num_threads.
// Coprimes are used for a random walk over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
@@ -123,7 +125,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
MaxSizeVector<unsigned> coprimes_;
- std::vector<EventCount::Waiter> waiters_;
+ MaxSizeVector<EventCount::Waiter> waiters_;
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 961456f10..4bc3dd1ba 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -55,6 +55,17 @@ class MaxSizeVector {
internal::aligned_free(data_);
}
+ void resize(size_t n) {
+ eigen_assert(n <= reserve_);
+ for (size_t i = size_; i < n; ++i) {
+ new (&data_[i]) T;
+ }
+ for (size_t i = n; i < size_; ++i) {
+ data_[i].~T();
+ }
+ size_ = n;
+ }
+
// Append new elements (up to reserved size).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void push_back(const T& t) {
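Usage sketch (illustrative, not part of this changeset): the new resize() default-constructs or destroys elements within the fixed capacity chosen at construction (n <= capacity is asserted); NonBlockingThreadPool uses it above to size its waiter list once the thread count is known. The values below are made up.

Eigen::MaxSizeVector<int> v(8);  // capacity fixed at 8, size() == 0
v.resize(3);                     // default-constructs three elements in place
v.push_back(42);                 // size() == 4
v.resize(2);                     // destroys the trailing elements, size() == 2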