diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2019-12-10 17:15:55 -0800 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2019-12-10 17:15:55 -0800 |
commit | c9220c035fa1f552c2caba27b22bd26950bdb4ea (patch) | |
tree | d30cd6113bfe36869fc676c2e2707c9d2c77d715 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | |
parent | 1c879eb010df8e53e5ac016ee5d155db2c721c2b (diff) |
Remove block memory allocation required by removed block evaluation API
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 50 |
1 file changed, 10 insertions, 40 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index b2327da1e..a1e3d175f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -223,30 +223,14 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, template <typename TensorBlockMapper> struct TensorExecutorTilingContext { - TensorExecutorTilingContext() : buffer(nullptr) {} TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, - const TensorOpCost& b_cost, void* b_buffer, - size_t b_aligned_size) + const TensorOpCost& b_cost, size_t b_aligned_size) : block_mapper(b_mapper), cost(b_cost), - buffer(b_buffer), aligned_blocksize(b_aligned_size) {} - template <typename Scalar> - Scalar* GetCurrentThreadBuffer(const ThreadPoolDevice& device) const { - // ThreadPoolDevice::currentThreadId() returns -1 if called from a thread - // not in the thread pool, such as the main thread dispatching Eigen - // expressions. - const int thread_idx = device.currentThreadId(); - eigen_assert(thread_idx >= -1 && thread_idx < device.numThreads()); - - const Index offset = aligned_blocksize * (thread_idx + 1); - return reinterpret_cast<Scalar*>(static_cast<char*>(buffer) + offset); - } - TensorBlockMapper block_mapper; // navigate through blocks TensorOpCost cost; // cost of computing a single block - void* buffer; // temporary buffer for blocks size_t aligned_blocksize; // block size after memory alignment }; @@ -254,37 +238,27 @@ struct TensorExecutorTilingContext { // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable> TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext( - const ThreadPoolDevice& device, const Evaluator& evaluator, - bool allocate_buffer = true) { + const Evaluator& evaluator) { // Query expression tree for desired block size/shape. 
- const TensorBlockResourceRequirements requirements = + TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); - int num_threads = device.numThreads(); - - // Estimate minimum block size based on cost. + // Update target block size based on cost model. TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost); - size_t block_size = static_cast<size_t>(1.0 / taskSize); + requirements.size = static_cast<size_t>(1.0 / taskSize); TensorBlockMapper block_mapper( typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements); - block_size = block_mapper.blockTotalSize(); + size_t block_size = block_mapper.blockTotalSize(); const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); const size_t aligned_blocksize = align * divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align); - // TODO(ezhulenev): In new block evaluation framework there is no need for - // allocating temporary buffers, remove this after migration. 
- void* buf = NULL; - if (allocate_buffer) { - buf = device.allocate((num_threads + 1) * aligned_blocksize); - } - - return {block_mapper, cost * block_size, buf, aligned_blocksize}; + return {block_mapper, cost * block_size, aligned_blocksize}; } template <typename Evaluator, typename StorageIndex, bool Vectorizable> @@ -393,8 +367,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, if (needs_assign) { const TilingContext tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, - Vectorizable>( - device, evaluator, /*allocate_buffer=*/false); + Vectorizable>(evaluator); auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) { @@ -498,10 +471,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, return; } - ctx->tiling = - internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, - Vectorizable>( - ctx->device, ctx->evaluator, /*allocate_buffer=*/false); + ctx->tiling = internal::GetTensorExecutorTilingContext< + Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { TensorBlockScratch scratch(ctx->device); @@ -531,7 +502,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, on_done(std::move(done)) {} ~TensorAsyncExecutorContext() { - device.deallocate(tiling.buffer); evaluator.cleanup(); on_done(); } |