From c9220c035fa1f552c2caba27b22bd26950bdb4ea Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 10 Dec 2019 17:15:55 -0800
Subject: Remove block memory allocation required by removed block evaluation API

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 50 +++++-----------------
 1 file changed, 10 insertions(+), 40 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b2327da1e..a1e3d175f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -223,30 +223,14 @@ class TensorExecutor
 
 template <typename TensorBlockMapper>
 struct TensorExecutorTilingContext {
-  TensorExecutorTilingContext() : buffer(nullptr) {}
   TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
-                              const TensorOpCost& b_cost, void* b_buffer,
-                              size_t b_aligned_size)
+                              const TensorOpCost& b_cost, size_t b_aligned_size)
       : block_mapper(b_mapper),
         cost(b_cost),
-        buffer(b_buffer),
         aligned_blocksize(b_aligned_size) {}
 
-  template <typename Scalar>
-  Scalar* GetCurrentThreadBuffer(const ThreadPoolDevice& device) const {
-    // ThreadPoolDevice::currentThreadId() returns -1 if called from a thread
-    // not in the thread pool, such as the main thread dispatching Eigen
-    // expressions.
-    const int thread_idx = device.currentThreadId();
-    eigen_assert(thread_idx >= -1 && thread_idx < device.numThreads());
-
-    const Index offset = aligned_blocksize * (thread_idx + 1);
-    return reinterpret_cast<Scalar*>(static_cast<char*>(buffer) + offset);
-  }
-
   TensorBlockMapper block_mapper;  // navigate through blocks
   TensorOpCost cost;               // cost of computing a single block
-  void* buffer;                    // temporary buffer for blocks
   size_t aligned_blocksize;        // block size after memory alignment
 };
 
@@ -254,37 +238,27 @@ struct TensorExecutorTilingContext {
 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
-    const ThreadPoolDevice& device, const Evaluator& evaluator,
-    bool allocate_buffer = true) {
+    const Evaluator& evaluator) {
   // Query expression tree for desired block size/shape.
-  const TensorBlockResourceRequirements requirements =
+  TensorBlockResourceRequirements requirements =
       evaluator.getResourceRequirements();
 
-  int num_threads = device.numThreads();
-
-  // Estimate minimum block size based on cost.
+  // Update target block size based on cost model.
   TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
   double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
-  size_t block_size = static_cast<size_t>(1.0 / taskSize);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
 
   TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);
 
-  block_size = block_mapper.blockTotalSize();
+  size_t block_size = block_mapper.blockTotalSize();
   const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
   const size_t aligned_blocksize =
       align *
       divup(block_size * sizeof(typename Evaluator::Scalar), align);
 
-  // TODO(ezhulenev): In new block evaluation framework there is no need for
-  // allocating temporary buffers, remove this after migration.
-  void* buf = NULL;
-  if (allocate_buffer) {
-    buf = device.allocate((num_threads + 1) * aligned_blocksize);
-  }
-
-  return {block_mapper, cost * block_size, buf, aligned_blocksize};
+  return {block_mapper, cost * block_size, aligned_blocksize};
 }
 
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
@@ -393,8 +367,7 @@ class TensorExecutor
     if (needs_assign) {
       const TilingContext tiling =
           internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                                   Vectorizable>(
-              device, evaluator, /*allocate_buffer=*/false);
+                                                   Vectorizable>(evaluator);
 
       auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                        IndexType lastBlockIdx) {
@@ -498,10 +471,8 @@ class TensorAsyncExecutor
         return;
       }
 
-      ctx->tiling =
-          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                                   Vectorizable>(
-              ctx->device, ctx->evaluator, /*allocate_buffer=*/false);
+      ctx->tiling = internal::GetTensorExecutorTilingContext<
+          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
 
       auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
         TensorBlockScratch scratch(ctx->device);
@@ -531,7 +502,6 @@ class TensorAsyncExecutor
           on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
-      device.deallocate(tiling.buffer);
       evaluator.cleanup();
       on_done();
     }
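
Illustrative note (not part of the commit): a minimal, hypothetical sketch of the user-facing path this executor serves. Assigning a tensor expression through .device(...) with a ThreadPoolDevice dispatches to internal::TensorExecutor; with tiled evaluation enabled, each block's temporary memory now comes from the per-thread TensorBlockScratch created inside the evaluation lambda (see the hunks above) rather than from a buffer preallocated by GetTensorExecutorTilingContext. The pool size and tensor shapes below are arbitrary.

    // Hypothetical usage sketch; values are arbitrary and not part of the patch.
    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);                 // worker threads
      Eigen::ThreadPoolDevice device(&pool, 4);  // device backed by the pool

      Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
      a.setRandom();
      b.setRandom();

      // Evaluated by the ThreadPoolDevice TensorExecutor modified above; with
      // tiling, blocks use per-thread TensorBlockScratch memory instead of a
      // shared preallocated buffer.
      c.device(device) = a + b * 0.5f;
      return 0;
    }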