Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  59
1 file changed, 38 insertions(+), 21 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5c3d4d630..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
const Index VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
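Illustrative sketch (not part of the patch): the same unroll-hint structure on a plain array, with a stand-in for evaluator.evalPacket. The fixed trip count of the inner loop is a strong hint that the compiler may unroll it, but it is also free to keep the loop and inline the body instead. All names below are hypothetical.

#include <cstddef>

// Stand-in for evaluator.evalPacket: processes kPacketSize consecutive elements.
static void processPacket(float* data, std::size_t i, std::size_t packet) {
  for (std::size_t k = 0; k < packet; ++k) data[i + k] += 1.0f;
}

void evalAll(float* data, std::size_t size) {
  const std::size_t kPacketSize = 4;  // assumed packet width
  const std::size_t unrolled = (size / (4 * kPacketSize)) * 4 * kPacketSize;
  for (std::size_t i = 0; i < unrolled; i += 4 * kPacketSize) {
    // Constant trip count: a strong unrolling hint, not a requirement.
    for (std::size_t j = 0; j < 4; ++j) {
      processPacket(data, i + j * kPacketSize, kPacketSize);
    }
  }
  const std::size_t vectorized = (size / kPacketSize) * kPacketSize;
  for (std::size_t i = unrolled; i < vectorized; i += kPacketSize) {
    processPacket(data, i, kPacketSize);
  }
  for (std::size_t i = vectorized; i < size; ++i) {
    data[i] += 1.0f;  // scalar tail
  }
}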
@@ -92,24 +93,30 @@ struct EvalRange {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ return size;
+ }
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
Index i = first;
- const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
last_chunk_offset = last - PacketSize;
for (; i <= last_chunk_offset; i += PacketSize) {
@@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ // Align block size to packet size and account for unrolling in run above.
+ if (size >= 16 * PacketSize) {
+ return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+ }
+ // Aligning to 4 * PacketSize would increase block size by more than 25%.
+ return (size + PacketSize - 1) & ~(PacketSize - 1);
+ }
};
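Illustrative sketch (not part of the patch): the round-up arithmetic behind alignBlockSize, assuming the alignment (PacketSize or 4 * PacketSize) is a power of two. Above the 16 * PacketSize threshold, rounding to 4 * PacketSize can grow a block by at most roughly 25%; below it, only PacketSize alignment is used. Values below are hypothetical.

#include <cassert>
#include <cstdio>

// Round size up to the next multiple of align; align must be a power of two.
static long roundUp(long size, long align) {
  return (size + align - 1) & ~(align - 1);
}

int main() {
  const long PacketSize = 8;
  // Large block (>= 16 * PacketSize = 128): align to 4 * PacketSize = 32.
  printf("%ld\n", roundUp(130, 4 * PacketSize));  // 160, about a 23% increase
  // Small block: align only to PacketSize to keep the growth modest.
  printf("%ld\n", roundUp(100, PacketSize));      // 104
  assert(roundUp(128, 32) == 128);                // already-aligned sizes are unchanged
  return 0;
}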
template <typename Expression, bool Vectorizable>
@@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+ [&evaluator](Index first, Index last) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+ });
+#else
size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
if (num_threads > 1) {
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-#endif
if (num_threads == 1) {
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
} else {
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
@@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
}
barrier.Wait();
}
+#endif // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
}
evaluator.cleanup();
}
};
-#endif
+#endif // EIGEN_USE_THREADS
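Illustrative sketch (not part of the patch): the block sizing used by the EIGEN_USE_SIMPLE_THREAD_POOL fallback above, worked on plain integers. It targets roughly size / num_threads coefficients per block and rounds the block size down to a multiple of PacketSize so each vectorized range starts packet-aligned; the non-fallback path instead delegates this decision to device.parallelFor via costPerCoeff and alignBlockSize. The numbers below are hypothetical.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const long size = 1000;   // total number of coefficients
  const long num_threads = 4;
  const long PacketSize = 8;
  long blocksz = static_cast<long>(std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
  const long blocksize = std::max(PacketSize, blocksz - (blocksz % PacketSize));
  const long numblocks = size / blocksize;
  printf("blocksize=%ld numblocks=%ld\n", blocksize, numblocks);  // blocksize=256, numblocks=3
  return 0;
}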
// GPU: the evaluation of the expression is offloaded to a GPU.
@@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
-
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
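Illustrative sketch (not part of the patch): a host-side C++ analogue of the grid-stride partitioning EigenMetaKernel uses, where first_index corresponds to blockIdx.x * blockDim.x + threadIdx.x and step_size to blockDim.x * gridDim.x. Names and values are hypothetical.

#include <cstdio>

// Each "thread" starts at its own index and strides by the total thread count,
// so every element in [0, size) is visited exactly once.
static void evalStrided(float* out, long size, long first_index, long step_size) {
  for (long i = first_index; i < size; i += step_size) {
    out[i] = static_cast<float>(i) * 2.0f;  // stands in for eval.evalScalar(i)
  }
}

int main() {
  const long size = 10;
  const long total_threads = 4;  // plays the role of blockDim.x * gridDim.x
  float out[10];
  for (long t = 0; t < total_threads; ++t) {
    evalStrided(out, size, /*first_index=*/t, /*step_size=*/total_threads);
  }
  for (long i = 0; i < size; ++i) printf("%g ", out[i]);
  printf("\n");
  return 0;
}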