diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-25 13:51:10 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2018-07-25 13:51:10 -0700 |
commit | 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (patch) | |
tree | e5dbd8f9c73087d37b1e812bc679d1dec2d3bfcd /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | |
parent | d55efa6f0f9ab9ec758c6b40204be476c01b7528 (diff) |
Add tiled evaluation support to TensorExecutor
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 262 |
1 files changed, 199 insertions, 63 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 53640c6aa..024de3696 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,29 +12,37 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tileable can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template<typename Expression, typename Device, bool Vectorizable> -class TensorExecutor -{ +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template <typename Expression, typename Device, bool Vectorizable, + bool Tileable> +class TensorExecutor { public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static inline void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator<Expression, Device> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); @@ -44,12 +52,14 @@ class TensorExecutor } }; - -template<typename Expression> -class TensorExecutor<Expression, DefaultDevice, true> -{ +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template <typename Expression> +class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> { public: typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { @@ -58,9 +68,11 @@ class TensorExecutor<Expression, DefaultDevice, true> if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + const int PacketSize = unpacket_traits<typename TensorEvaluator< + Expression, DefaultDevice>::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { @@ -80,9 +92,75 @@ class TensorExecutor<Expression, DefaultDevice, true> } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> { + public: + typedef typename Expression::Index Index; + + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + using Evaluator = TensorEvaluator<Expression, DefaultDevice>; + using Index = typename traits<Expression>::Index; + const int NumDims = traits<Expression>::NumDimensions; -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + using Scalar = typename traits<Expression>::Scalar; + using ScalarNoConst = typename remove_const<Scalar>::type; + + using TensorBlock = + TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>; + using TensorBlockMapper = + TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, + false>::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + // Query expression tree for desired block size/shape. + std::vector<TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + Scalar* data = static_cast<Scalar*>( + device.allocate(block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + */ #ifdef EIGEN_USE_THREADS template <typename Evaluator, typename Index, bool Vectorizable> struct EvalRange { @@ -100,7 +178,7 @@ struct EvalRange { }; template <typename Evaluator, typename Index> -struct EvalRange<Evaluator, Index, true> { +struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> { static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; static void run(Evaluator* evaluator_in, const Index first, const Index last) { @@ -110,8 +188,8 @@ struct EvalRange<Evaluator, Index, true> { if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { for (Index j = 0; j < 4; j++) { @@ -138,55 +216,113 @@ struct EvalRange<Evaluator, Index, true> { } }; -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { public: typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const Index PacketSize = + Vectorizable + ? unpacket_traits<typename Evaluator::PacketReturnType>::size + : 1; const Index size = array_prod(evaluator.dimensions()); - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; - Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange<Evaluator, Index, Vectorizable>::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); + } + evaluator.cleanup(); + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> { + public: + typedef typename Expression::Index Index; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + typedef typename internal::remove_const< + typename traits<Expression>::Scalar>::type Scalar; + typedef typename traits<Expression>::Index Index; + + static const int NumDims = traits<Expression>::NumDimensions; + + typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock; + typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout> + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, + false>::run(expr, device); + evaluator.cleanup(); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost); + size_t block_size = static_cast<size_t>(1.0 / taskSize); + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_size); + block_size = block_mapper.block_dims_total_size(); + const size_t aligned_blocksize = + EIGEN_MAX_ALIGN_BYTES * + divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + device.parallelFor( + block_mapper.total_block_count(), cost * block_size, + [=, &device, &evaluator, &block_mapper](Index first, Index last) { + // currentThreadId() returns -1 if called from a thread not in the + // threadpool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < num_threads); + Scalar* thread_buf = reinterpret_cast<Scalar*>( + static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1)); + for (Index i = first; i < last; ++i) { + auto block = block_mapper.GetBlockForIndex(i, thread_buf); + evaluator.evalBlock(&block); + } + }); + device.deallocate(buf); } evaluator.cleanup(); } }; + #endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, GpuDevice, Vectorizable> { +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { public: typedef typename Expression::Index Index; static void run(const Expression& expr, const GpuDevice& device); @@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) { } /*static*/ -template <typename Expression, bool Vectorizable> -inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( +template <typename Expression, bool Vectorizable, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |