From 96a276803c68274396af1e3411bc6d3f6921f8c7 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 16 May 2019 16:15:45 -0700
Subject: Always evaluate Tensor expressions with broadcasting via tiled
 evaluation code path

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 57 ++++++++++++++++++++--
 1 file changed, 52 insertions(+), 5 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index e2ff11129..d57203ad9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -28,6 +28,50 @@ namespace Eigen {
  */
 namespace internal {
 
+/**
+ * Evaluating TensorBroadcastingOp via the coefficient or packet path is
+ * extremely expensive. If an expression has at least one broadcast op in it,
+ * and it supports block-based evaluation, we always prefer it, even for small
+ * tensors. For all other tileable ops, block evaluation overhead for small
+ * tensors (fits into L1) is too large, and we fall back on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template <typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorAssignOp<LhsXprType, RhsXprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template <typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
+            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+  };
+};
+
+template <typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorBroadcastingOp<Broadcast, XprType> > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
 /**
  * Default strategy: the expression is evaluated sequentially with a single cpu
  * thread, without vectorization and block evaluation.
@@ -121,11 +165,12 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
     Evaluator evaluator(expr, device);
     Index total_size = array_prod(evaluator.dimensions());
     Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-    if (total_size < cache_size) {
+    if (total_size < cache_size &&
+        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
-      // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
       internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               Tileable>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
+      evaluator.cleanup();
       return;
     }
 
@@ -260,10 +305,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ t
     Evaluator evaluator(expr, device);
     Index total_size = array_prod(evaluator.dimensions());
     Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-    if (total_size < cache_size) {
+
+    if (total_size < cache_size &&
+        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                               Tileable>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
       evaluator.cleanup();
       return;
     }
-- 
cgit v1.2.3