GEMM: catch all scalar-multiple variants when falling-back to a coeff-based product.

Before only s*A*B was caught which was both inconsistent with GEMM, sub-optimal, and could even lead to compilation-errors (https://stackoverflow.com/questions/54738495).
author: Gael Guennebaud <g.gael@free.fr> 2019-02-18 11:47:54 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2019-02-18 11:47:54 +0100
commit: 512b74aaa19fa12a05774dd30205d2c97e8bdef9 (patch)
tree: efadb2022fb2291c4b733a7c7f4670dce6b01ba3 /Eigen
parent: ec032ac03b90dc6c58680a4dc858133e9a72fd1f (diff)
2 files changed, 54 insertions, 23 deletions
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 27796315d..60b79b855 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -411,35 +411,56 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
 
-  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
-  //    dst {,+,-}= s * (A.lazyProduct(B))
-  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
-  // For them, this strategy is also faster than simply by-passing the heap allocation through
-  // stack allocation.
-  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
-  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
-  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
-  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template<typename Dst, typename Func>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
-                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
   {
-    call_restricted_packet_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha =    blas_traits<Lhs>::extractScalarFactor(lhs)
+                          * blas_traits<Rhs>::extractScalarFactor(rhs);
+    eval_dynamic_impl(dst,
+                      blas_traits<Lhs>::extract(lhs),
+                      blas_traits<Rhs>::extract(rhs),
+                      func,
+                      actualAlpha,
+                      typename conditional<HasScalarFactor,true_type,false_type>::type());
+
+    
   }
 
-  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
-  // overload more specialized.
-  template<typename Dst, typename LhsT, typename Func>
+protected:
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& /* s == 1 */, false_type)
   {
     call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
   }
-  
-  
-//   template<typename Dst>
-//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
+  {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+  }
 };
 
 // This specialization enforces the use of a coefficient-based evaluation strategy
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a32630ed7..bc0a01540 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -274,7 +274,8 @@ template<typename XprType> struct blas_traits
     HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
                               && (   bool(XprType::IsVectorAtCompileTime)
                                   || int(inner_stride_at_compile_time<XprType>::ret) == 1)
-                             ) ?  1 : 0
+                             ) ?  1 : 0,
+    HasScalarFactor = false
   };
   typedef typename conditional<bool(HasUsableDirectAccess),
     ExtractType,
@@ -306,6 +307,9 @@ template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -317,6 +321,9 @@ template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -335,6 +342,9 @@ template<typename Scalar, typename NestedXpr>
 struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -358,7 +368,7 @@ struct blas_traits<Transpose<NestedXpr> >
     typename ExtractType::PlainObject
     >::type DirectLinearAccessType;
   enum {
-    IsTransposed = Base::IsTransposed ? 0 : 1
+    IsTransposed = Base::IsTransposed ? 0 : 1,
   };
   static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
   static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
author	Gael Guennebaud <g.gael@free.fr>	2019-02-18 11:47:54 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2019-02-18 11:47:54 +0100
commit	512b74aaa19fa12a05774dd30205d2c97e8bdef9 (patch)
tree	efadb2022fb2291c4b733a7c7f4670dce6b01ba3 /Eigen
parent	ec032ac03b90dc6c58680a4dc858133e9a72fd1f (diff)