Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 59 |
1 file changed, 38 insertions, 21 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5c3d4d630..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
     {
       const Index size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
       const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
       for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
       }
       const Index VectorizedSize = (size / PacketSize) * PacketSize;
       for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
@@ -92,24 +93,30 @@ struct EvalRange {
       evaluator.evalScalar(i);
     }
   }
+
+  static Index alignBlockSize(Index size) {
+    return size;
+  }
 };

 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
+  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
   static void run(Evaluator* evaluator_in, const Index first, const Index last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
     Index i = first;
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
       Index last_chunk_offset = last - 4 * PacketSize;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
       for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
       }
       last_chunk_offset = last - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
@@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> {
       evaluator.evalScalar(i);
     }
   }
+
+  static Index alignBlockSize(Index size) {
+    // Align block size to packet size and account for unrolling in run above.
+    if (size >= 16 * PacketSize) {
+      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+    }
+    // Aligning to 4 * PacketSize would increase block size by more than 25%.
+    return (size + PacketSize - 1) & ~(PacketSize - 1);
+  }
 };

 template <typename Expression, bool Vectorizable>
@@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         });
+#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
         Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
         }
         barrier.Wait();
       }
+#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
     }
     evaluator.cleanup();
   }
 };
-#endif
+#endif  // EIGEN_USE_THREADS


 // GPU: the evaluation of the expression is offloaded to a GPU.
@@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {

   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;

-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
   EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }
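
Note on the new EvalRange::alignBlockSize: the expression (size + N - 1) & ~(N - 1) rounds size up to the next multiple of N, which is only correct when N is a power of two (true for Eigen's packet sizes). Rounding to 4 * PacketSize pads a block by at most 4 * PacketSize - 1 coefficients, which stays under 25% overhead only once size >= 16 * PacketSize, hence the threshold in the patch. The following is a standalone sketch, not code from this commit; the PacketSize of 8 is an assumed example value.

// Standalone sketch of the rounding used by alignBlockSize above.
// Assumes PacketSize is a power of two; 8 is a made-up example width.
#include <cstdio>

typedef long Index;
static const int PacketSize = 8;  // hypothetical packet width

static Index alignBlockSize(Index size) {
  if (size >= 16 * PacketSize) {
    // Round up to a multiple of 4 * PacketSize (here 32); the padding is
    // at most 4 * PacketSize - 1, i.e. under 25% of 16 * PacketSize.
    return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
  }
  // Below the threshold, rounding to 4 * PacketSize could grow the block
  // by more than 25%, so only round up to a multiple of PacketSize.
  return (size + PacketSize - 1) & ~(PacketSize - 1);
}

int main() {
  printf("%ld\n", alignBlockSize(100));  // 104: next multiple of 8
  printf("%ld\n", alignBlockSize(130));  // 160: next multiple of 32
  return 0;
}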
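The non-SIMPLE_THREAD_POOL code path hands parallelFor a block-alignment callback plus a range functor. The sketch below is a toy sequential stand-in for that dispatch pattern, not Eigen's ThreadPoolDevice::parallelFor (which also derives the block size from the cost model); the driver name and the raw_block_size parameter are invented for illustration.

// Toy driver: split [0, n) into blocks whose size the caller may round up,
// then hand each aligned [first, last) range to the functor.
#include <algorithm>
#include <cstdio>
#include <functional>

typedef long Index;

static void parallelForSketch(Index n, Index raw_block_size,
                              std::function<Index(Index)> block_align,
                              std::function<void(Index, Index)> f) {
  Index block_size = block_align ? block_align(raw_block_size) : raw_block_size;
  for (Index first = 0; first < n; first += block_size) {
    f(first, std::min(first + block_size, n));  // final block may be short
  }
}

int main() {
  // Blocks of 100 get rounded up to a multiple of 32, so the functor sees
  // aligned boundaries everywhere except the final, truncated block.
  parallelForSketch(1000, 100,
                    [](Index s) { return (s + 31) & ~Index(31); },
                    [](Index first, Index last) {
                      printf("[%ld, %ld)\n", first, last);
                    });
  return 0;
}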