From 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 25 Jul 2018 13:51:10 -0700 Subject: Add tiled evaluation support to TensorExecutor --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 262 ++++++++++++++++----- 1 file changed, 199 insertions(+), 63 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 53640c6aa..024de3696 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,29 +12,37 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tileable can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static inline void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); @@ -44,12 +52,14 @@ class TensorExecutor } }; - -template -class TensorExecutor -{ +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { @@ -58,9 +68,11 @@ class TensorExecutor if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { @@ -80,9 +92,75 @@ class TensorExecutor } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. 
+ */ +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + using Evaluator = TensorEvaluator; + using Index = typename traits::Index; + const int NumDims = traits::NumDimensions; -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + using Scalar = typename traits::Scalar; + using ScalarNoConst = typename remove_const::type; + + using TensorBlock = + TensorBlock; + using TensorBlockMapper = + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. + internal::TensorExecutor::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + Scalar* data = static_cast( + device.allocate(block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + */ #ifdef EIGEN_USE_THREADS template struct EvalRange { @@ -100,7 +178,7 @@ struct EvalRange { }; template -struct EvalRange { +struct EvalRange { static const int PacketSize = unpacket_traits::size; static void run(Evaluator* evaluator_in, const Index first, const Index last) { @@ -110,8 +188,8 @@ struct EvalRange { if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. 
for (; i <= last_chunk_offset; i += 4*PacketSize) { for (Index j = 0; j < 4; j++) { @@ -138,55 +216,113 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const Index PacketSize = + Vectorizable + ? unpacket_traits::size + : 1; const Index size = array_prod(evaluator.dimensions()); - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef typename internal::remove_const< + typename traits::Scalar>::type Scalar; + typedef typename traits::Index Index; + + static const int NumDims = traits::NumDimensions; + + typedef TensorBlock TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor::run(expr, device); + evaluator.cleanup(); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. 
+ TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel::taskSize(1, cost); + size_t block_size = static_cast(1.0 / taskSize); + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_size); + block_size = block_mapper.block_dims_total_size(); + const size_t aligned_blocksize = + EIGEN_MAX_ALIGN_BYTES * + divup(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + device.parallelFor( + block_mapper.total_block_count(), cost * block_size, + [=, &device, &evaluator, &block_mapper](Index first, Index last) { + // currentThreadId() returns -1 if called from a thread not in the + // threadpool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < num_threads); + Scalar* thread_buf = reinterpret_cast( + static_cast(buf) + aligned_blocksize * (thread_idx + 1)); + for (Index i = first; i < last; ++i) { + auto block = block_mapper.GetBlockForIndex(i, thread_buf); + evaluator.evalBlock(&block); + } + }); + device.deallocate(buf); } evaluator.cleanup(); } }; + #endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; static void run(const Expression& expr, const GpuDevice& device); @@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) { } /*static*/ -template -inline void TensorExecutor::run( +template +inline void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); -- cgit v1.2.3
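
A minimal usage sketch, not part of the commit itself: the executors modified above are internal and are selected automatically when a tensor expression is assigned through a device. The snippet below only shows the user-facing entry point that, with EIGEN_USE_THREADS defined, dispatches into the ThreadPoolDevice specializations touched by this patch; whether the tiled (Tileable) specialization is actually chosen depends on internal traits of the expression tree, and the thread count and tensor sizes here are arbitrary.

    // Sketch: evaluating a coefficient-wise tensor expression on a
    // ThreadPoolDevice. Thread count and tensor sizes are arbitrary; the
    // Vectorizable/Tileable template arguments of the executor are deduced
    // internally from the expression, not chosen by the user.
    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
      a.setRandom();
      b.setRandom();

      // The device owns the worker threads; internal::TensorExecutor
      // partitions the evaluation across them (per coefficient/packet, or per
      // block when the tiled path is taken).
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      // Assigning through .device() routes the assignment expression to
      // internal::TensorExecutor<..., ThreadPoolDevice, ...>::run.
      c.device(device) = a + b * 0.5f;

      return 0;
    }

Without the .device(...) call the same assignment is evaluated through the DefaultDevice executors at the top of this file, i.e. the single-threaded scalar, vectorized, or (when the expression supports block access) tiled code paths.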