From 788bef6ab55bc2897e29be308996b8937da4a38d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 17 Dec 2019 19:06:14 +0000
Subject: Reduce block evaluation overhead for small tensor expressions

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 28 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index e2f1806cb..b90791d8d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -374,15 +374,23 @@ class TensorExecutor
[... text lost in extraction here: the template argument list following `TensorExecutor` and the hunk's leading context lines; the `ctx->` prefix of the next line is reconstructed from the matching added line below ...]
-      ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
-                                   ctx->tiling.cost, eval_block, [ctx]() { delete ctx; });
+
+      // Evaluate small expressions directly as a single block.
+      if (ctx->tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(ctx->device);
+        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+        ctx->evaluator.evalBlock(desc, scratch);
+        delete ctx;
+      } else {
+        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
+                                     ctx->tiling.cost, eval_block,
+                                     [ctx]() { delete ctx; });
+      }
     };
 
     ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
-- 
cgit v1.2.3