 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 12
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h      | 10
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h              | 86
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h                  |  5
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h         |  8
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h                  |  2
 6 files changed, 107 insertions(+), 16 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// across k dimension.
const TensorOpCost cost =
contractionCost(m, n, bm, bn, bk, shard_by_col, false);
- Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
static_cast<double>(n) * m, cost, this->m_device.numThreads());
// TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
class Context {
public:
Context(const Device& device, int num_threads, LhsMapper& lhs,
- RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+ RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
Index gn, Index nm0, Index nn0, bool shard_by_col,
bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
lhs_(lhs),
rhs_(rhs),
buffer_(buffer),
- output_(buffer, m),
+ output_(buffer, tm),
num_threads_(num_threads),
shard_by_col_(shard_by_col),
parallel_pack_(parallel_pack),
- m_(m),
- n_(n),
- k_(k),
+ m_(tm),
+ n_(tn),
+ k_(tk),
bm_(bm),
bn_(bn),
bk_(bk),
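Note: in this file the thread count is now held as an int, the type produced by TensorCostModel<ThreadPoolDevice>::numThreads and consumed by the Context constructor, and the constructor parameters become tm/tn/tk, presumably to avoid clashes with identically named variables elsewhere in the evaluator. A minimal standalone sketch of the signed thread-count idea (pickNumThreads, total_cost and cost_per_shard are made-up names, not Eigen's cost model):

#include <algorithm>

// Hypothetical helper: derive a thread count from a cost estimate and return
// it as a plain int, so callers can store it in an int without narrowing.
inline int pickNumThreads(double total_cost, double cost_per_shard, int max_threads) {
  const int ideal = static_cast<int>(total_cost / cost_per_shard);
  return std::max(1, std::min(max_threads, ideal));
}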
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..c770d024f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
// Build a thread pool device on top the an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
std::function<void(Index, Index)> f) const {
typedef TensorCostModel<ThreadPoolDevice> CostModel;
if (n <= 1 || numThreads() == 1 ||
- CostModel::numThreads(n, cost, numThreads()) == 1) {
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
f(0, n);
return;
}
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
- Barrier barrier(block_count);
+ Barrier barrier(static_cast<unsigned int>(block_count));
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
if (last - first <= block_size) {
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {
private:
ThreadPoolInterface* pool_;
- size_t num_threads_;
+ int num_threads_;
};
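Note: ThreadPoolDevice now stores its thread count as an int rather than size_t, and the one place an unsigned value is still required (the Barrier count) gets an explicit cast. A small self-contained illustration of the signed/unsigned comparison pitfall this sidesteps (toy code, not Eigen's):

#include <cstddef>
#include <iostream>

int main() {
  std::size_t unsigned_threads = 8;  // old size_t num_threads_
  int signed_threads = 8;            // new int num_threads_
  long long index = -1;              // a signed index, e.g. not yet clamped
  // Comparing a signed value with a size_t converts the signed side to
  // unsigned, so a cast is needed to get the intended result.
  std::cout << (index < static_cast<long long>(unsigned_threads)) << "\n";  // 1
  // With a signed thread count the plain comparison already does the right thing.
  std::cout << (index < signed_threads) << "\n";                            // 1
  return 0;
}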
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..bf52e490f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
// Standard reduction functors
template <typename T> struct SumReducer
{
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::type::HasAdd
+ };
+};
+
+
template <typename T> struct MeanReducer
{
static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::type::HasAdd
+ };
+};
+
+
template <typename T> struct MaxReducer
{
static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::type::HasMax
+ };
+};
+
+
template <typename T> struct MinReducer
{
static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::type::HasMin
+ };
+};
+
template <typename T> struct ProdReducer
{
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::type::HasMul
+ };
+};
+
struct AndReducer
{
@@ -280,6 +331,15 @@ struct AndReducer
}
};
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
}
};
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
+
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
// Random number generation
namespace {
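Note: the new reducer_traits<Reducer, Device> primary template supplies conservative defaults (Cost = 1, no packet access), and each reducer specializes it, so packet availability can now depend on the device as well as the scalar type. A standalone sketch of the pattern with toy types (not Eigen's actual reducers or devices):

#include <iostream>

struct CpuDevice {};
struct GpuDevice {};

template <typename T> struct ToySumReducer {};

// Primary template: safe defaults for reducers without a specialization.
template <typename Reducer, typename Device>
struct toy_reducer_traits {
  enum { Cost = 1, PacketAccess = false };
};

// Summing floats is assumed to vectorize on either toy device.
template <typename Device>
struct toy_reducer_traits<ToySumReducer<float>, Device> {
  enum { Cost = 1, PacketAccess = true };
};

int main() {
  std::cout << toy_reducer_traits<ToySumReducer<float>, GpuDevice>::PacketAccess << "\n";  // 1
  std::cout << toy_reducer_traits<ToySumReducer<int>, CpuDevice>::PacketAccess << "\n";    // 0
  return 0;
}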
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b1645d56f..82a905a65 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -55,6 +55,11 @@ struct PacketType {
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <>
+ struct PacketType<half, GpuDevice> {
+ typedef half2 type;
+ static const int size = 2;
+ };
+template <>
struct PacketType<float, GpuDevice> {
typedef float4 type;
static const int size = 4;
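Note: the added specialization maps Eigen::half to the CUDA half2 packet (two elements per packet) when a GpuDevice is used. A toy sketch of how such a trait is typically consumed, with made-up types standing in for half/half2:

#include <iostream>

// Toy stand-ins for Eigen::half and CUDA's half2.
struct toy_half  { unsigned short x; };
struct toy_half2 { toy_half a, b; };
struct GpuDevice {};

// Hypothetical PacketType-style trait: a packet typedef plus a compile-time
// size that vectorized loops use as their stride.
template <typename T, typename Device>
struct PacketTypeSketch { typedef T type; static const int size = 1; };

template <>
struct PacketTypeSketch<toy_half, GpuDevice> { typedef toy_half2 type; static const int size = 2; };

int main() {
  const int n = 8;
  // A vectorized loop consumes PacketTypeSketch<...>::size elements per step.
  for (int i = 0; i < n; i += PacketTypeSketch<toy_half, GpuDevice>::size)
    std::cout << "packet starting at " << i << "\n";
  return 0;
}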
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index e82530955..1b4fdd03f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -331,7 +331,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -346,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
return;
}
- FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -608,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -627,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};
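Note: the fp16 reduction paths now consult reducer_traits<Op, GpuDevice>::PacketAccess rather than the reducer's own Op::PacketAccess, so the GPU launchers see the device-specific (half2-based) answer. A toy sketch of that compile-time dispatch (simplified launcher, not Eigen's actual signature):

#include <iostream>

struct GpuDevice {};
struct ToyHalfSumReducer {};

template <typename Reducer, typename Device>
struct toy_reducer_traits { enum { PacketAccess = false }; };

// Assume the half-sum reducer vectorizes on the toy GPU device.
template <>
struct toy_reducer_traits<ToyHalfSumReducer, GpuDevice> { enum { PacketAccess = true }; };

// Simplified launcher selected entirely at compile time by a bool.
template <bool PacketAccess>
struct ToyReductionLauncher {
  static void run() { std::cout << "scalar reduction path\n"; }
};
template <>
struct ToyReductionLauncher<true> {
  static void run() { std::cout << "packet (vectorized) reduction path\n"; }
};

int main() {
  ToyReductionLauncher<toy_reducer_traits<ToyHalfSumReducer, GpuDevice>::PacketAccess>::run();
  return 0;
}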
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 61df8032d..0d084141d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_dimensions;
+ return m_dimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {