author    | Rasmus Munk Larsen <rmlarsen@google.com> | 2016-04-14 18:28:23 -0700
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2016-04-14 18:28:23 -0700
commit    | 07ac4f7e027cddd3457a34295420480f7e541ac5 (patch)
tree      | 0cc5a19bba742e755dbd7b796cecbf741347e66a /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent    | aeb5494a0b2edef3be447cec222e2d178e413389 (diff)
Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions. The cost model is turned off by default.
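For context, the cost model this message refers to is compiled in only when `EIGEN_USE_COST_MODEL` is defined, as the final executor hunk below shows. A minimal opt-in sketch, assuming the contemporary `ThreadPool`/`ThreadPoolDevice` constructor signatures; the pool size and tensor shapes are arbitrary:

```cpp
// Sketch only (not from the commit): EIGEN_USE_COST_MODEL must be defined
// before the Tensor header so the #ifdef branch in TensorExecutor below is
// compiled in. The cost model is off by default.
#define EIGEN_USE_THREADS
#define EIGEN_USE_COST_MODEL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads (arbitrary count)
  Eigen::ThreadPoolDevice device(&pool, 4);  // device handed to the executor
  Eigen::Tensor<float, 1> a(1 << 20), b(1 << 20), c(1 << 20);
  a.setRandom();
  b.setRandom();
  // Evaluated via TensorExecutor<..., ThreadPoolDevice, ...>; with the cost
  // model enabled, the executor may use fewer threads for cheap expressions.
  c.device(device) = a + b;
  return 0;
}
```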
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 93
1 file changed, 64 insertions(+), 29 deletions(-)
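The first hunk below replaces the single packet loop in the DefaultDevice executor with a manually four-way unrolled packet loop, followed by a single-packet cleanup loop and a scalar tail. Stripped of the evaluator, that loop structure looks like this sketch (`evalPacket`/`evalScalar` stand in for the evaluator's real methods):

```cpp
// Sketch of the three-tier loop structure introduced in the first hunk:
// a 4x-unrolled packet loop, a single-packet cleanup loop, and a scalar tail.
void eval_all(long size, long PacketSize,
              void (*evalPacket)(long), void (*evalScalar)(long)) {
  // Largest multiple of 4*PacketSize not exceeding size.
  const long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
  long i = 0;
  for (; i < UnrolledSize; i += 4 * PacketSize) {
    evalPacket(i);
    evalPacket(i + PacketSize);
    evalPacket(i + 2 * PacketSize);
    evalPacket(i + 3 * PacketSize);
  }
  // Largest multiple of PacketSize not exceeding size.
  const long VectorizedSize = (size / PacketSize) * PacketSize;
  for (; i < VectorizedSize; i += PacketSize) evalPacket(i);  // packet cleanup
  for (; i < size; ++i) evalScalar(i);                        // scalar tail
}
```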
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index eabfd91fe..df9cc0998 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
     {
       const Index size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      // Manually unroll this loop since compilers don't do it.
+      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+        evaluator.evalPacket(i);
+        evaluator.evalPacket(i+PacketSize);
+        evaluator.evalPacket(i+2*PacketSize);
+        evaluator.evalPacket(i+3*PacketSize);
+      }
       const Index VectorizedSize = (size / PacketSize) * PacketSize;
-
-      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
       for (Index i = VectorizedSize; i < size; ++i) {
@@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(void* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+    eigen_assert(last >= first);
     for (Index i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
@@ -88,28 +96,45 @@ struct EvalRange {
 
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(void* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+    eigen_assert(last >= first);
 
     Index i = first;
-    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index lastPacket = last - (last % PacketSize);
-      for (; i < lastPacket; i += PacketSize) {
+      Index last_chunk_offset = last - 4 * PacketSize;
+      // Manually unroll this loop since compilers don't do it.
+      for (; i <= last_chunk_offset; i += 4*PacketSize) {
+        evaluator.evalPacket(i);
+        evaluator.evalPacket(i+PacketSize);
+        evaluator.evalPacket(i+2*PacketSize);
+        evaluator.evalPacket(i+3*PacketSize);
+      }
+      last_chunk_offset = last - PacketSize;
+      for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-
     for (; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 };
 
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+// Used to make an std::function to add to the ThreadPool with less templating
+// than EvalRange::Run.
+// This requires that this and EvalRange takes a void* to the evaluator that can
+// be downcast to the right type by the EvalRange.
+template <typename Index>
+inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index),
+                            void* evaluator, const Index first, const Index last) {
+  run_fn(evaluator, first, last);
+}
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
  public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
@@ -119,24 +144,35 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
+      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-
-      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
-      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-
-      Barrier barrier(numblocks);
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
+      int num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+      if (num_threads > 1) {
+        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+            size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-
-      if (static_cast<Index>(numblocks) * blocksize < size) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
+#endif
+      if (num_threads == 1) {
+        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+      } else {
+        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+        const Index numblocks = size / blocksize;
+
+        Barrier barrier(numblocks);
+        for (int i = 0; i < numblocks; ++i) {
+          device.enqueue_with_barrier(
+              &barrier, &InvokeEvalRange<Index>,
+              &EvalRange<Evaluator, Index, Vectorizable>::run,
+              static_cast<void*>(&evaluator), i * blocksize,
+              (i + 1) * blocksize);
+        }
+        if (numblocks * blocksize < size) {
+          EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, numblocks * blocksize, size);
+        }
+        barrier.Wait();
       }
-
-      barrier.Wait();
     }
     evaluator.cleanup();
   }
@@ -226,7 +262,6 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
 #endif  // __CUDACC__
 #endif  // EIGEN_USE_GPU
 
-
 } // end namespace internal
 
 } // end namespace Eigen
```
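To make the partitioning in the ThreadPoolDevice hunk concrete, take the illustrative values size = 1000, num_threads = 4, PacketSize = 8: blocksz = ceil(1000/4) + 8 - 1 = 257, blocksize = max(8, 257 - 257 % 8) = 256, and numblocks = 1000 / 256 = 3, so three packet-aligned blocks [0, 256), [256, 512), [512, 768) are enqueued on the pool and the caller evaluates the tail [768, 1000) itself before waiting on the barrier. The same arithmetic as a standalone sketch (values are illustrative, not from Eigen):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Standalone sketch of the block partitioning in TensorExecutor's
// ThreadPoolDevice path: block sizes are rounded down to a multiple of
// PacketSize (but kept at least one packet wide), and the sub-block tail
// is evaluated on the calling thread.
int main() {
  const long size = 1000, num_threads = 4, PacketSize = 8;
  long blocksz =
      static_cast<long>(std::ceil(static_cast<float>(size) / num_threads)) +
      PacketSize - 1;                                        // 257
  const long blocksize =
      std::max(PacketSize, blocksz - blocksz % PacketSize);  // 256
  const long numblocks = size / blocksize;                   // 3
  for (long i = 0; i < numblocks; ++i)                       // enqueued blocks
    std::printf("worker block: [%ld, %ld)\n", i * blocksize, (i + 1) * blocksize);
  if (numblocks * blocksize < size)                          // caller-thread tail
    std::printf("caller tail:  [%ld, %ld)\n", numblocks * blocksize, size);
  return 0;
}
```

Passing the evaluator as a `void*` through `InvokeEvalRange`, instead of by value through a fully templated run function, keeps the callable stored by the thread pool independent of the evaluator type in everything but `Index`; as the new comment in the diff notes, this reduces the templating needed to build the `std::function` handed to the pool.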