From 13c3327f5cf829fd9d04a2ab46861e722cd74ca0 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 12 Nov 2019 10:12:28 -0800
Subject: Remove legacy block evaluation support

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 202 ---------------------
 1 file changed, 202 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0fb0a9227..9926046b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -153,70 +153,6 @@ class TensorExecutor
-class TensorExecutor {
- public:
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  typedef TensorEvaluator Evaluator;
-  typedef typename traits::Index StorageIndex;
-
-  static const int NumDims = traits::NumDimensions;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const DefaultDevice& device = DefaultDevice()) {
-    typedef TensorBlock TensorBlock;
-    typedef TensorBlockMapper TensorBlockMapper;
-    typedef typename TensorBlock::Dimensions TensorBlockDimensions;
-
-    Evaluator evaluator(expr, device);
-    Index total_size = array_prod(evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size
-        && !ExpressionHasTensorBroadcastingOp::value) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor::run(expr, device);
-      evaluator.cleanup();
-      return;
-    }
-
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign) {
-      // Size tensor blocks to fit in cache (or requested target block size).
-      Index block_total_size = numext::mini(cache_size, total_size);
-      TensorBlockShapeType block_shape = kSkewedInnerDims;
-      // Query expression tree for desired block size/shape.
-      std::vector resources;
-      evaluator.getResourceRequirements(&resources);
-      MergeResourceRequirements(resources, &block_shape, &block_total_size);
-
-      TensorBlockMapper block_mapper(
-          TensorBlockDimensions(evaluator.dimensions()), block_shape,
-          block_total_size);
-      block_total_size = block_mapper.block_dims_total_size();
-
-      ScalarNoConst* data = static_cast(
-          device.allocate(block_total_size * sizeof(Scalar)));
-
-      const StorageIndex total_block_count = block_mapper.total_block_count();
-      for (StorageIndex i = 0; i < total_block_count; ++i) {
-        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
-        evaluator.evalBlock(&block);
-      }
-      device.deallocate(data);
-    }
-    evaluator.cleanup();
-  }
-};
-
 /**
  * Process all the data with a single cpu thread, using blocks of data. By
  * sizing a block to fit L1 cache we get better cache performance.
@@ -446,59 +382,6 @@ class TensorExecutor {
   }
 };
-template
-class TensorExecutor {
- public:
-  typedef typename traits::Index StorageIndex;
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  static const int NumDims = traits::NumDimensions;
-
-  typedef TensorEvaluator Evaluator;
-  typedef TensorBlockMapper BlockMapper;
-  typedef TensorExecutorTilingContext TilingContext;
-
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const ThreadPoolDevice& device) {
-    Evaluator evaluator(expr, device);
-    Index total_size = array_prod(evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size &&
-        !ExpressionHasTensorBroadcastingOp::value) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor::run(expr,
-                                    device);
-      evaluator.cleanup();
-      return;
-    }
-
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
-    if (needs_assign) {
-      const TilingContext tiling =
-          internal::GetTensorExecutorTilingContext(device, evaluator);
-
-      device.parallelFor(
-          tiling.block_mapper.total_block_count(), tiling.cost,
-          [=, &device, &evaluator, &tiling](StorageIndex firstIdx,
-                                            StorageIndex lastIdx) {
-            ScalarNoConst* thread_buf =
-                tiling.template GetCurrentThreadBuffer(device);
-            for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
-              auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf);
-              evaluator.evalBlock(&block);
-            }
-          });
-      device.deallocate(tiling.buffer);
-    }
-    evaluator.cleanup();
-  }
-};
-
 template
 class TensorExecutor {
@@ -603,91 +486,6 @@ class TensorAsyncExecutor
-class TensorAsyncExecutor {
- public:
-  typedef typename traits::Index StorageIndex;
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  static const int NumDims = traits::NumDimensions;
-
-  typedef TensorEvaluator Evaluator;
-  typedef TensorBlockMapper
-      BlockMapper;
-  typedef TensorExecutorTilingContext TilingContext;
-
-  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
-                                           const ThreadPoolDevice& device,
-                                           DoneCallback done) {
-    TensorAsyncExecutorContext* const ctx =
-        new TensorAsyncExecutorContext(expr, device, std::move(done));
-
-    Index total_size = array_prod(ctx->evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size &&
-        !ExpressionHasTensorBroadcastingOp::value) {
-      auto delete_ctx = [ctx]() { delete ctx; };
-      internal::TensorAsyncExecutor<
-          Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
-          /*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
-      return;
-    }
-
-    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
-      if (!need_assign) {
-        delete ctx;
-        return;
-      }
-
-      ctx->tiling =
-          GetTensorExecutorTilingContext(
-              device, ctx->evaluator);
-
-      auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
-        ScalarNoConst* thread_buf =
-            ctx->tiling.template GetCurrentThreadBuffer(
-                ctx->device);
-        for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
-          auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
-          ctx->evaluator.evalBlock(&block);
-        }
-      };
-      device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
-                              ctx->tiling.cost, eval_block,
-                              [ctx]() { delete ctx; });
-    };
-
-    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
-  }
-
- private:
-  struct TensorAsyncExecutorContext {
-    TensorAsyncExecutorContext(const Expression& expr,
-                               const ThreadPoolDevice& thread_pool,
-                               DoneCallback done)
-        : device(thread_pool),
-          evaluator(expr, thread_pool),
-          on_done(std::move(done)) {}
-
-    ~TensorAsyncExecutorContext() {
-      device.deallocate(tiling.buffer);
-      evaluator.cleanup();
-      on_done();
-    }
-
-    const ThreadPoolDevice& device;
-    Evaluator evaluator;
-    TilingContext tiling;
-
-   private:
-    DoneCallback on_done;
-  };
-};
-
 template
 class TensorAsyncExecutor {
--
cgit v1.2.3