author    Rasmus Munk Larsen <rmlarsen@google.com>  2016-04-14 18:28:23 -0700
committer Rasmus Munk Larsen <rmlarsen@google.com>  2016-04-14 18:28:23 -0700
commit    07ac4f7e027cddd3457a34295420480f7e541ac5 (patch)
tree      0cc5a19bba742e755dbd7b796cecbf741347e66a /unsupported
parent    aeb5494a0b2edef3be447cec222e2d178e413389 (diff)
Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions. The cost model is turned off by default.
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  93
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 104
4 files changed, 111 insertions, 97 deletions
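
The commit message above summarizes the change: when EIGEN_USE_COST_MODEL is defined, the thread-pool executor and the full reducer ask TensorCostModel<ThreadPoolDevice>::numThreads() how many threads an expression is worth, instead of always fanning work out to the whole pool (see the TensorExecutor.h and TensorReduction.h hunks below). A rough, self-contained sketch of that kind of heuristic follows; it is not the actual Eigen cost model, and the names and threshold are illustrative only.

#include <algorithm>
#include <cstdint>

// Illustrative stand-in for a per-coefficient cost estimate (not Eigen API).
struct PerCoeffCost {
  double cycles;  // abstract cost of producing one output coefficient
};

// Pick enough threads that each task carries a minimum amount of work,
// capped by the pool size. Threshold and names are made up.
int estimate_threads(std::int64_t num_coeffs, PerCoeffCost cost, int max_threads) {
  const double kMinCostPerTask = 100000.0;
  const double total_cost = static_cast<double>(num_coeffs) * cost.cycles;
  const int wanted = static_cast<int>(total_cost / kMinCostPerTask);
  return std::max(1, std::min(max_threads, wanted));
}

Small expressions then end up with a single thread, which is exactly the num_threads == 1 fast path added to TensorExecutor below.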
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 32bc5d0b2..4e8f86674 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,9 +10,9 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-#if !defined(EIGEN_USE_GPU)
-#define EIGEN_USE_COST_MODEL
-#endif
+//#if !defined(EIGEN_USE_GPU)
+//#define EIGEN_USE_COST_MODEL
+//#endif
namespace Eigen {
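
With the automatic definition commented out, the cost model becomes opt-in. Assuming the usual Eigen Tensor setup, it can be re-enabled by defining the macro before the Tensor header is included (or with an equivalent -D compiler flag):

// Opt back in to the experimental cost model (off by default after this patch).
#define EIGEN_USE_COST_MODEL
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>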
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index f1f9a90df..293012646 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -189,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
return loadConstant(m_data+index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
protected:
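
The new costPerCoeff hook reports the per-coefficient cost as bytes loaded, bytes stored and compute cycles, with the trailing arguments carrying the vectorization flag and packet size. A plain tensor only loads one coefficient, hence sizeof(CoeffReturnType) bytes and no compute, as above. An evaluator that applies a functor would typically add the functor's cost on top of its argument's cost; a hypothetical sketch of that pattern (UnaryOp, m_argImpl and PacketSize are illustrative members, but the shape mirrors the reduction code later in this patch):

  // Hypothetical costPerCoeff for an evaluator applying UnaryOp to one input.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
    return m_argImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
  }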
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index eabfd91fe..df9cc0998 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ // Manually unroll this loop since compilers don't do it.
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+ evaluator.evalPacket(i);
+ evaluator.evalPacket(i+PacketSize);
+ evaluator.evalPacket(i+2*PacketSize);
+ evaluator.evalPacket(i+3*PacketSize);
+ }
const Index VectorizedSize = (size / PacketSize) * PacketSize;
-
- for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (Index i = VectorizedSize; i < size; ++i) {
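
The same peel-then-remainder structure, stripped of the evaluator machinery, looks like this as a standalone sketch (plain C++, not Eigen code):

#include <cstddef>

// Evaluate `size` elements in chunks of 4*PacketSize, then PacketSize,
// then one at a time -- the same shape as the executor loop above.
template <int PacketSize, typename PacketFn, typename ScalarFn>
void eval_unrolled(std::ptrdiff_t size, PacketFn eval_packet, ScalarFn eval_scalar) {
  const std::ptrdiff_t unrolled = (size / (4 * PacketSize)) * 4 * PacketSize;
  std::ptrdiff_t i = 0;
  for (; i < unrolled; i += 4 * PacketSize) {
    eval_packet(i);
    eval_packet(i + PacketSize);
    eval_packet(i + 2 * PacketSize);
    eval_packet(i + 3 * PacketSize);
  }
  const std::ptrdiff_t vectorized = (size / PacketSize) * PacketSize;
  for (; i < vectorized; i += PacketSize) eval_packet(i);
  for (; i < size; ++i) eval_scalar(i);
}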
@@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
+ static void run(void* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+ eigen_assert(last >= first);
for (Index i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
@@ -88,28 +96,45 @@ struct EvalRange {
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
+ static void run(void* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+ eigen_assert(last >= first);
Index i = first;
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index lastPacket = last - (last % PacketSize);
- for (; i < lastPacket; i += PacketSize) {
+ Index last_chunk_offset = last - 4 * PacketSize;
+ // Manually unroll this loop since compilers don't do it.
+ for (; i <= last_chunk_offset; i += 4*PacketSize) {
+ evaluator.evalPacket(i);
+ evaluator.evalPacket(i+PacketSize);
+ evaluator.evalPacket(i+2*PacketSize);
+ evaluator.evalPacket(i+3*PacketSize);
+ }
+ last_chunk_offset = last - PacketSize;
+ for (; i <= last_chunk_offset; i += PacketSize) {
evaluator.evalPacket(i);
}
}
-
for (; i < last; ++i) {
evaluator.evalScalar(i);
}
}
};
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+// Used to make an std::function to add to the ThreadPool with less templating
+// than EvalRange::run.
+// This requires that both this function and EvalRange take a void* to the
+// evaluator, which EvalRange casts back to the right type.
+template <typename Index>
+inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index),
+ void* evaluator, const Index first, const Index last) {
+ run_fn(evaluator, first, last);
+}
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
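
The InvokeEvalRange helper above is a type-erasure trick: the closure handed to the thread pool only involves a function pointer taking void*, so it does not depend on the heavily templated Evaluator type and one instantiation is shared by every expression. A minimal standalone sketch of the same idea (SumEvaluator, make_task and the long index type are illustrative, not Eigen's API):

#include <functional>
#include <vector>

// Type-erased range task: a plain function pointer plus a void* payload.
template <typename Index>
void invoke_range(void (*run_fn)(void*, Index, Index),
                  void* payload, Index first, Index last) {
  run_fn(payload, first, last);
}

// A concrete "evaluator" the pool never needs to know about.
struct SumEvaluator {
  const std::vector<double>* data;
  double* out;
  static void run(void* self_in, long first, long last) {
    SumEvaluator* self = static_cast<SumEvaluator*>(self_in);
    for (long i = first; i < last; ++i) *self->out += (*self->data)[i];
  }
};

// The stored std::function only mentions invoke_range<long>, never SumEvaluator.
std::function<void()> make_task(SumEvaluator* ev, long first, long last) {
  return [=]() { invoke_range<long>(&SumEvaluator::run, ev, first, last); };
}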
@@ -119,24 +144,35 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
-
- static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
- int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-
- Barrier barrier(numblocks);
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
+ int num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ if (num_threads > 1) {
+ num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-
- if (static_cast<Index>(numblocks) * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
+#endif
+ if (num_threads == 1) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+ } else {
+ Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+ const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+ Barrier barrier(numblocks);
+ for (int i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(
+ &barrier, &InvokeEvalRange<Index>,
+ &EvalRange<Evaluator, Index, Vectorizable>::run,
+ static_cast<void*>(&evaluator), i * blocksize,
+ (i + 1) * blocksize);
+ }
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, numblocks * blocksize, size);
+ }
+ barrier.Wait();
}
-
- barrier.Wait();
}
evaluator.cleanup();
}
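
The partitioning above rounds the per-task block size up to a multiple of PacketSize and lets the calling thread evaluate whatever remains after numblocks full blocks. With illustrative numbers (not taken from a real run), the arithmetic works out as follows:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Illustrative sizes: 1,000,000 coefficients, 4 threads, packets of 8 floats.
  const long size = 1000000, num_threads = 4, PacketSize = 8;
  long blocksz = static_cast<long>(
      std::ceil(static_cast<double>(size) / num_threads)) + PacketSize - 1;
  const long blocksize = std::max(PacketSize, blocksz - (blocksz % PacketSize));
  const long numblocks = size / blocksize;
  // Each of numblocks tasks gets blocksize coefficients; the remaining
  // size - numblocks * blocksize coefficients are evaluated by the caller.
  std::printf("blocksize=%ld numblocks=%ld tail=%ld\n",
              blocksize, numblocks, size - numblocks * blocksize);
  return 0;  // prints: blocksize=250000 numblocks=4 tail=0
}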
@@ -226,7 +262,6 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
#endif // __CUDACC__
#endif // EIGEN_USE_GPU
-
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 1c9e7ab66..885295f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -214,7 +214,7 @@ struct FullReducer {
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
- *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
};
@@ -222,18 +222,19 @@ struct FullReducer {
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
- bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+ bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
- *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, firstIndex, numValuesToReduce, reducer);
}
};
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static const bool HasOptimizedImplementation = !Op::IsStateful;
static const int PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;
@@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
*output = reducer.finalize(reducer.initialize());
return;
}
- const std::size_t num_threads = device.numThreads();
- if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
- return;
- } else {
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
-
- Barrier barrier(numblocks);
- MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
- i * blocksize, blocksize, reducer, &shards[i]);
- }
-
- typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
- } else {
- finalShard = reducer.initialize();
- }
- barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
- reducer.reduce(shards[i], &finalShard);
- }
- *output = reducer.finalize(finalShard);
- }
- }
-};
-
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, true> {
- static const bool HasOptimizedImplementation = !Op::IsStateful;
- static const int PacketSize =
- unpacket_traits<typename Self::PacketReturnType>::size;
-
- // launch one reducer per thread and accumulate the result.
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
- typename Self::CoeffReturnType* output) {
- typedef typename Self::Index Index;
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- if (num_coeffs == 0) {
- *output = reducer.finalize(reducer.initialize());
- return;
- }
- const std::size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ const TensorOpCost cost =
+ self.m_impl.costPerCoeff(Vectorizable) +
+ TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+ PacketSize);
+ const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ num_coeffs, cost, device.numThreads());
+#else
+ const int num_threads = device.numThreads();
+#endif
if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
+ *output =
+ InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
+ const Index blocksize =
+ std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+ const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+ eigen_assert(num_coeffs >= numblocks * blocksize);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+ for (Index i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
+ if (numblocks * blocksize < num_coeffs) {
+ finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+ self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+ reducer);
} else {
finalShard = reducer.initialize();
}
-
barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
+
+ for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
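
This is a standard shard-and-combine reduction: each task reduces one contiguous block into its own slot in shards, the calling thread handles the tail block, and once the barrier clears the per-shard partials are folded into the final result. The same shape, written as a standalone sketch using std::thread rather than Eigen's ThreadPoolDevice and Barrier:

#include <numeric>
#include <thread>
#include <vector>

// Shard-and-combine sum: each worker reduces one block into its own slot,
// then the partial results are combined serially.
double parallel_sum(const std::vector<double>& v, int num_threads) {
  const long n = static_cast<long>(v.size());
  const long blocksize = n / num_threads;
  std::vector<double> shards(num_threads, 0.0);   // reducer.initialize() per shard
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) {
    workers.emplace_back([&, t] {
      const long first = t * blocksize;
      const long last = (t == num_threads - 1) ? n : first + blocksize;
      shards[t] = std::accumulate(v.begin() + first, v.begin() + last, 0.0);
    });
  }
  for (auto& w : workers) w.join();               // plays the role of Barrier::Wait
  return std::accumulate(shards.begin(), shards.end(), 0.0);  // finalize
}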
@@ -498,13 +464,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ static bool size_large_enough(Index total_size) {
+#ifndef EIGEN_USE_COST_MODEL
+ return total_size > 1024 * 1024;
+#else
+ return true || total_size;
+#endif
+ }
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
+ (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
bool need_assign = false;
if (!data) {