From ef9dfee7bdc8e0d82c9b7ddf9414ef99d866d7ba Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 24 Sep 2019 12:52:45 -0700
Subject: Tensor block evaluation V2 support for unary/binary/broadcasting

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 172 ++++++++++++++++++---
 1 file changed, 150 insertions(+), 22 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index cf07656b3..a7cb8dc97 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -23,7 +23,7 @@ namespace Eigen {
  *
  * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
  *                      instructions)
- * @tparam Tileable     can use block based tensor evaluation
+ * @tparam Tiling       can use block based tensor evaluation
  *                      (see TensorBlock.h)
  */
 namespace internal {
@@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
  * Default strategy: the expression is evaluated sequentially with a single cpu
  * thread, without vectorization and block evaluation.
  */
+#if EIGEN_HAS_CXX11
 template <typename Expression, typename Device, bool Vectorizable,
-          bool Tileable>
+          TiledEvaluation Tiling>
+#else
+template <typename Expression, typename Device, bool Vectorizable,
+          TiledEvaluation::TiledEvaluation Tiling>
+#endif
 class TensorExecutor {
  public:
   typedef typename Expression::Index StorageIndex;
@@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
 /**
  * Process all the data with a single cpu thread, using vectorized instructions.
  */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
-                     /*Tileable=*/false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+                     /*Tiling=*/TiledEvaluation::Off> {
  public:
   typedef typename Expression::Index StorageIndex;
@@ -152,7 +157,7 @@ class TensorExecutor
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                     /*Tileable=*/true> {
+                     /*Tiling=*/TiledEvaluation::Legacy> {
  public:
   typedef typename traits<Expression>::Scalar Scalar;
   typedef typename remove_const<Scalar>::type ScalarNoConst;
@@ -176,8 +181,7 @@ class TensorExecutor
     if (total_size < cache_size &&
         !ExpressionHasTensorBroadcastingOp<Expression>::value) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               /*Tileable=*/false>::run(expr, device);
+      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                               /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
       evaluator.cleanup();
       return;
     }
@@ -211,6 +215,70 @@ class TensorExecutor
   }
 };
 
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+  typedef typename traits<Expression>::Index StorageIndex;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const DefaultDevice& device = DefaultDevice()) {
+    typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims,
+                        Evaluator::Layout> TensorBlock;
+    typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims,
+                              Evaluator::Layout> TensorBlockMapper;
+    typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+
+    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
+    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+        TensorBlockScratch;
+
+    Evaluator evaluator(expr, device);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+    // TODO(ezhulenev): Do not use tiling for small tensors?
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
+    if (needs_assign) {
+      // Size tensor blocks to fit in cache (or requested target block size).
+      Index block_total_size = numext::mini(cache_size, total_size);
+      TensorBlockShapeType block_shape = kSkewedInnerDims;
+      // Query expression tree for desired block size/shape.
+      std::vector<TensorOpResourceRequirements> resources;
+      evaluator.getResourceRequirements(&resources);
+      MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+      TensorBlockMapper block_mapper(
+          TensorBlockDimensions(evaluator.dimensions()), block_shape,
+          block_total_size);
+      block_total_size = block_mapper.block_dims_total_size();
+
+      // Share scratch memory allocator between all blocks.
+      TensorBlockScratch scratch(device);
+
+      const StorageIndex total_block_count = block_mapper.total_block_count();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
+        TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
+
+        TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+        evaluator.evalBlockV2(desc, scratch);
+        scratch.reset();
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
 /**
  * Multicore strategy: the index space is partitioned and each partition is
  * executed on a single core.
@@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
 };
 
 // Computes a block evaluation parameters, and allocates temporary memory buffer
-// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
 template <typename Evaluator, typename BlockMapper, bool Vectorizable>
 TensorExecutorTilingContext<BlockMapper> GetTensorExecutorTilingContext(
-    const ThreadPoolDevice& device, const Evaluator& evaluator) {
+    const ThreadPoolDevice& device, const Evaluator& evaluator,
+    bool allocate_buffer = true) {
   // Prefer blocks skewed toward inner dimension.
   TensorBlockShapeType block_shape = kSkewedInnerDims;
   Index block_total_size = 0;
@@ -284,7 +353,13 @@ TensorExecutorTilingContext GetTensorExecutorTilingContext(
   const size_t aligned_blocksize =
       align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
-  void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+
+  // TODO(ezhulenev): In new block evaluation framework there is no need for
+  // allocating temporary buffers, remove this after migration.
+  void* buf = NULL;
+  if (allocate_buffer) {
+    buf = device.allocate((num_threads + 1) * aligned_blocksize);
+  }
 
   return {block_mapper, cost * block_size, buf, aligned_blocksize};
 }
@@ -344,8 +419,8 @@ struct EvalRange {
   }
 };
 
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
  public:
   typedef typename Expression::Index StorageIndex;
@@ -369,7 +444,8 @@ class TensorExecutor {
 };
 
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable=*/true> {
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::Legacy> {
  public:
   typedef typename traits<Expression>::Index StorageIndex;
   typedef typename traits<Expression>::Scalar Scalar;
@@ -387,11 +463,12 @@ class TensorExecutor
     Evaluator evaluator(expr, device);
     Index total_size = array_prod(evaluator.dimensions());
     Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-    if (total_size < cache_size && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
+    if (total_size < cache_size &&
+        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
       internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                               /*Tileable=*/false>::run(expr, device);
+                               /*Tiling=*/TiledEvaluation::Off>::run(expr,
+                                                                     device);
       evaluator.cleanup();
       return;
     }
@@ -419,6 +496,57 @@ class TensorExecutor
   }
 };
 
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
+                            Evaluator::Layout>
+      BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+      TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const ThreadPoolDevice& device) {
+    Evaluator evaluator(expr, device);
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const TilingContext tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+                                                   Vectorizable>(
+              device, evaluator, /*allocate_buffer=*/false);
+
+      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+                                                       IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
+          auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
+          TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+          evaluator.evalBlockV2(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
+                         eval_block);
+    }
+    evaluator.cleanup();
+  }
+};
+
 template <typename Expression, typename DoneCallback, bool Vectorizable,
           bool Tileable>
 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
@@ ... @@
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
  public:
   typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
@@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
 }
 
 /*static*/
-template <typename Expression, bool Vectorizable, bool Tileable>
-EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
     const Expression& expr, const GpuDevice& device) {
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
@@ -711,8 +839,8 @@ struct ExecExprFunctorKernel
             range_, vectorizable_threads_, evaluator) {}
 };
 
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
  public:
   typedef typename Expression::Index Index;
   static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
--
cgit v1.2.3
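
Note (editor's addition, not part of the patch): the sketch below shows one way the new TiledEvaluation::On specializations could be driven directly, in the style of Eigen's internal executor tests. The function name, tensor shapes, scalar type and the explicit Vectorizable/Tiling template arguments are illustrative assumptions; ordinary user code never names internal::TensorExecutor, since the executor is selected automatically when a tensor expression is assigned on a device.

#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical example: evaluate a broadcasting assignment through the
// single-threaded tiled executor added in this patch. Shapes and the
// float/RowMajor choices are arbitrary.
void run_broadcast_with_tiling() {
  Eigen::DefaultDevice device;

  Eigen::Tensor<float, 2, Eigen::RowMajor> src(8, 1);
  src.setRandom();
  Eigen::Tensor<float, 2, Eigen::RowMajor> dst(8, 32);

  // Broadcast the size-1 inner dimension 32 times: (8, 1) -> (8, 32).
  Eigen::array<Eigen::Index, 2> bcast = {{1, 32}};
  const auto expr = src.broadcast(bcast);

  // Wrap the assignment into an expression and hand it to the executor with
  // tiling enabled; run() then evaluates the expression block by block via
  // evalBlockV2(), reusing one scratch allocator (see the patch above).
  using Assign = Eigen::TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor = Eigen::internal::TensorExecutor<
      const Assign, Eigen::DefaultDevice,
      /*Vectorizable=*/true, Eigen::internal::TiledEvaluation::On>;

  Executor::run(Assign(dst, expr), device);
}

The notable design point, also visible in the new allocate_buffer=false argument of GetTensorExecutorTilingContext, is that the V2 path no longer pre-allocates per-thread output buffers: each block is described by a TensorBlockDesc and materialized through evalBlockV2, with temporary memory coming from a TensorBlockScratchAllocator that is reset after every block.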