aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2018-07-02 11:41:09 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2018-07-02 11:41:09 +0200
commitd428a199ab70bc08db7551457a1e9d8f65d9ebb9 (patch)
treecff1c55b4bede4c05c27957efb0846ff864ab626 /Eigen/src/Core
parenta7b313a16cf5b64981dd953f150327638379e68b (diff)
bug #1562: optimize evaluation of small products of the form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary. Measured speedup from 2x to 5x...
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--Eigen/src/Core/ProductEvaluators.h28
-rw-r--r--Eigen/src/Core/products/GeneralMatrixMatrix.h8
2 files changed, 31 insertions, 5 deletions
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 60de771b2..8072a1959 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
}
-
+
template<typename Dst>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
+
+ // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+ // dst {,+,-}= s * (A.lazyProduct(B))
+ // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
+ // For them, this strategy is also faster than simply by-passing the heap allocation through
+ // stack allocation.
+ // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+ // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
+ // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+ template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+ const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+ }
+
+ // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
+ // overload more specialized.
+ template<typename Dst, typename LhsT, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+ }
+
// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..dc47cf7cf 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// to determine the following heuristic.
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
// unless it has been specialized by the user or for a given architecture.
- // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+ // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
// I'm not sure it is still required.
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::evalTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();
@@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::addTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}
@@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::subTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}