aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2016-06-14 15:33:47 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2016-06-14 15:33:47 +0200
commit76236cdea402e7236249ab19bd5d8d6ceac1346d (patch)
tree3b979d88426a94114e8af03877cc47de09862b59 /unsupported/Eigen
parent1004c4df99a3e4a019f05b83badb06f4e2df5ee6 (diff)
parent4c61f00838202889045ec9e5ad0d60b79f00fec5 (diff)
merge
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h12
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h10
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h14
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h16
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h86
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h37
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h2
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h28
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorScan.h12
9 files changed, 160 insertions, 57 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// across k dimension.
const TensorOpCost cost =
contractionCost(m, n, bm, bn, bk, shard_by_col, false);
- Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
static_cast<double>(n) * m, cost, this->m_device.numThreads());
// TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
class Context {
public:
Context(const Device& device, int num_threads, LhsMapper& lhs,
- RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+ RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
Index gn, Index nm0, Index nn0, bool shard_by_col,
bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
lhs_(lhs),
rhs_(rhs),
buffer_(buffer),
- output_(buffer, m),
+ output_(buffer, tm),
num_threads_(num_threads),
shard_by_col_(shard_by_col),
parallel_pack_(parallel_pack),
- m_(m),
- n_(n),
- k_(k),
+ m_(tm),
+ n_(tn),
+ k_(tk),
bm_(bm),
bn_(bn),
bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..c770d024f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
// Build a thread pool device on top the an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
std::function<void(Index, Index)> f) const {
typedef TensorCostModel<ThreadPoolDevice> CostModel;
if (n <= 1 || numThreads() == 1 ||
- CostModel::numThreads(n, cost, numThreads()) == 1) {
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
f(0, n);
return;
}
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
- Barrier barrier(block_count);
+ Barrier barrier(static_cast<unsigned int>(block_count));
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
if (last - first <= block_size) {
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {
private:
ThreadPoolInterface* pool_;
- size_t num_threads_;
+ int num_threads_;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 4e873011e..a48cb1daa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -426,6 +426,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
m_arg3Impl(op.arg3Expression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg2Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg3Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg2Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg3Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+
eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 9509f8002..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -227,22 +227,6 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
TernaryOp(typename Arg1XprType::Scalar,
typename Arg2XprType::Scalar,
typename Arg3XprType::Scalar)>::type Scalar;
- EIGEN_STATIC_ASSERT(
- (internal::is_same<typename traits<Arg1XprType>::StorageKind,
- typename traits<Arg2XprType>::StorageKind>::value),
- STORAGE_KIND_MUST_MATCH)
- EIGEN_STATIC_ASSERT(
- (internal::is_same<typename traits<Arg1XprType>::StorageKind,
- typename traits<Arg3XprType>::StorageKind>::value),
- STORAGE_KIND_MUST_MATCH)
- EIGEN_STATIC_ASSERT(
- (internal::is_same<typename traits<Arg1XprType>::Index,
- typename traits<Arg2XprType>::Index>::value),
- STORAGE_INDEX_MUST_MATCH)
- EIGEN_STATIC_ASSERT(
- (internal::is_same<typename traits<Arg1XprType>::Index,
- typename traits<Arg3XprType>::Index>::value),
- STORAGE_INDEX_MUST_MATCH)
typedef traits<Arg1XprType> XprTraits;
typedef typename traits<Arg1XprType>::StorageKind StorageKind;
typedef typename traits<Arg1XprType>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..a8e48fced 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
// Standard reduction functors
template <typename T> struct SumReducer
{
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MeanReducer
{
static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MaxReducer
{
static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMax
+ };
+};
+
+
template <typename T> struct MinReducer
{
static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMin
+ };
+};
+
template <typename T> struct ProdReducer
{
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::HasMul
+ };
+};
+
struct AndReducer
{
@@ -280,6 +331,15 @@ struct AndReducer
}
};
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
}
};
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
+
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
// Random number generation
namespace {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b1645d56f..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
// Default packet types
template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
typedef typename internal::packet_traits<Scalar>::type type;
- enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
template <>
-struct PacketType<float, GpuDevice> {
- typedef float4 type;
- static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
- typedef double2 type;
+struct PacketType<half, GpuDevice> {
+ typedef half2 type;
static const int size = 2;
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasLog = 1,
+ HasLog1p = 0,
+ HasLog10 = 0,
+ HasPow = 1,
+ };
};
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 52cfc2824..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 0d1a098b7..d9bbcd858 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
if (block == 0) {
// We're the first block to run, initialize the output value
atomicExch(output, reducer.initialize());
- unsigned int old = atomicExch(semaphore, 2u);
- assert(old == 1u);
+ __threadfence();
+ atomicExch(semaphore, 2u);
}
else {
+ // Wait for the first block to initialize the output value.
// Use atomicCAS here to ensure that the reads aren't cached
- unsigned int val = atomicCAS(semaphore, 2u, 2u);
- while (val < 2u) {
+ unsigned int val;
+ do {
val = atomicCAS(semaphore, 2u, 2u);
}
+ while (val < 2u);
}
}
}
@@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
}
if (gridDim.x > 1 && threadIdx.x == 0) {
- unsigned int ticket = atomicInc(semaphore, UINT_MAX);
- assert(ticket >= 2u);
- if (ticket == gridDim.x + 1) {
- // We're the last block, reset the semaphore
- *semaphore = 0;
- }
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
}
}
@@ -330,10 +328,10 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case
// of floats and half floats.
- #ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -348,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
return;
}
- FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -610,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -629,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 031dbf6f2..5207f6a8d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -81,7 +81,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
m_output(NULL) {
// Accumulating a scalar isn't supported.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(m_axis >= 0 && m_axis < NumDims);
// Compute stride of scan axis
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_dimensions;
+ return m_dimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return true;
}
}
-
+
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
@@ -152,6 +152,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return m_output[index];
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
if (m_output != NULL) {
m_device.deallocate(m_output);