3 files changed, 400 insertions, 45 deletions
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index cc8477598..015e7d698 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -446,36 +446,6 @@ protected:
   typename evaluator<ArgType>::type m_argImpl;
 };
 
-// -------------------- Product --------------------
-
-template<typename Lhs, typename Rhs>
-struct evaluator_impl<Product<Lhs,Rhs> > : public evaluator<typename Product<Lhs,Rhs>::PlainObject>::type
-{
-  typedef Product<Lhs,Rhs> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type evaluator_base;
-  
-//   enum {
-//     EvaluateLhs = ;
-//     EvaluateRhs = ;
-//   };
-  
-  evaluator_impl(const XprType& product) : evaluator_base(m_result)
-  {
-    // here we process the left and right hand sides with a specialized evaluator
-    // perhaps this step should be done by the TreeOptimizer to get a canonical tree and reduce evaluator instanciations
-    // typename product_operand_evaluator<Lhs>::type m_lhsImpl(product.lhs());
-    // typename product_operand_evaluator<Rhs>::type m_rhsImpl(product.rhs());
-  
-    // TODO do not rely on previous product mechanism !!
-    m_result.resize(product.rows(), product.cols());
-    m_result.noalias() = product.lhs() * product.rhs();
-  }
-  
-protected:  
-  PlainObject m_result;
-};
-
 // -------------------- Map --------------------
 
 template<typename Derived, int AccessorsType>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index f7824aa80..6e66888c7 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -42,25 +42,16 @@ template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
   *
   */
 
+// Use ProductReturnType to get correct traits, in particular vectorization flags
 namespace internal {
 template<typename Lhs, typename Rhs>
 struct traits<Product<Lhs, Rhs> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<Lhs>::type LhsCleaned;
-  typedef typename remove_all<Rhs>::type RhsCleaned;
-  typedef typename scalar_product_traits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<LhsCleaned>::StorageKind,
-                                        typename traits<RhsCleaned>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<LhsCleaned>::Index,
-                                      typename traits<RhsCleaned>::Index>::type Index;
+  : traits<typename ProductReturnType<Lhs, Rhs>::Type>
+{ 
+  // We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
+  // TODO: This flag should eventually go in a separate evaluator traits class
   enum {
-    RowsAtCompileTime = LhsCleaned::RowsAtCompileTime,
-    ColsAtCompileTime = RhsCleaned::ColsAtCompileTime,
-    MaxRowsAtCompileTime = LhsCleaned::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = RhsCleaned::MaxColsAtCompileTime,
-    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0), // TODO should be no storage order
-    CoeffReadCost = 0 // TODO CoeffReadCost should not be part of the expression traits
+    Flags = traits<typename ProductReturnType<Lhs, Rhs>::Type>::Flags & ~EvalBeforeNestingBit
   };
 };
 } // end namespace internal
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
new file mode 100644
index 000000000..aadaa9303
--- /dev/null
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,394 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+  
+// We can evaluate the product either all at once, like GeneralProduct and its evalTo() function, or
+// traverse the matrix coefficient by coefficient, like CoeffBasedProduct.  Use the existing logic
+// in ProductReturnType to decide.
+
+template<typename XprType, typename ProductType>
+struct product_evaluator_dispatcher;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_impl<Product<Lhs, Rhs> >
+  : product_evaluator_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type> 
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef product_evaluator_dispatcher<XprType, typename ProductReturnType<Lhs, Rhs>::Type> Base;
+
+  evaluator_impl(const XprType& xpr) : Base(xpr) 
+  { }
+};
+
+// Case 1: Evaluate all at once
+//
+// We can view the GeneralProduct class as a part of the product evaluator. 
+// Four sub-cases: InnerProduct, OuterProduct, GemmProduct and GemvProduct.
+// InnerProduct is special because GeneralProduct does not have an evalTo() method in this case.
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> > 
+  : public evaluator<typename Product<Lhs, Rhs>::PlainObject>::type
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename evaluator<PlainObject>::type evaluator_base;
+  
+  product_evaluator_dispatcher(const XprType& xpr) : evaluator_base(m_result)
+  {
+    m_result.coeffRef(0,0) = (xpr.lhs().transpose().cwiseProduct(xpr.rhs())).sum();
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// For the other three subcases, simply call the evalTo() method of GeneralProduct
+// TODO: GeneralProduct should take evaluators, not expression objects.
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> > 
+  : public evaluator<typename Product<Lhs, Rhs>::PlainObject>::type
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename evaluator<PlainObject>::type evaluator_base;
+  
+  product_evaluator_dispatcher(const XprType& xpr) : evaluator_base(m_result)
+  {
+    m_result.resize(xpr.rows(), xpr.cols());
+    GeneralProduct<Lhs, Rhs, ProductType>(xpr.lhs(), xpr.rhs()).evalTo(m_result);
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator object do not know their size.
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
+  : evaluator_impl_base<Product<Lhs, Rhs> >
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, Flags> CoeffBasedProductType;
+
+  product_evaluator_dispatcher(const XprType& xpr) 
+    : m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs()),  
+      m_innerDim(xpr.lhs().cols())
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  enum {
+    RowsAtCompileTime = traits<CoeffBasedProductType>::RowsAtCompileTime,
+    PacketSize = packet_traits<Scalar>::size,
+    InnerSize  = traits<CoeffBasedProductType>::InnerSize,
+    CoeffReadCost = traits<CoeffBasedProductType>::CoeffReadCost,
+    Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+    CanVectorizeInner = traits<CoeffBasedProductType>::CanVectorizeInner
+  };
+
+  typedef typename evaluator<Lhs>::type LhsEtorType;
+  typedef typename evaluator<Rhs>::type RhsEtorType;
+  typedef etor_product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
+                                  Unroll ? InnerSize-1 : Dynamic,
+                                  LhsEtorType, RhsEtorType, Scalar> CoeffImpl;
+
+  const CoeffReturnType coeff(Index row, Index col) const
+  {
+    Scalar res;
+    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+   * which is why we don't set the LinearAccessBit.
+   */
+  const CoeffReturnType coeff(Index index) const
+  {
+    Scalar res;
+    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template<int LoadMode>
+  const PacketReturnType packet(Index row, Index col) const
+  {
+    PacketScalar res;
+    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
+				     Unroll ? InnerSize-1 : Dynamic,
+				     LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+protected:
+  typename evaluator<Lhs>::type m_lhsImpl;
+  typename evaluator<Rhs>::type m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path  - no vectorization ***
+**************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
+  {
+    etor_product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, innerDim, res);
+    res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, RetScalar &res)
+  {
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+    for(Index i = 1; i < innerDim; ++i)
+      res += lhs.coeff(row, i) * rhs.coeff(i, col);
+  }
+};
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
+struct etor_product_coeff_vectorized_unroller
+{
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::PacketScalar &pres)
+  {
+    etor_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
+    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet>
+struct etor_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::PacketScalar &pres)
+  {
+    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::PacketScalar Packet;
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
+  {
+    Packet pres;
+    etor_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
+    etor_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, innerDim, res);
+    res = predux(pres);
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct etor_product_coeff_vectorized_dyn_selector
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
+template<typename Lhs, typename Rhs, int RhsCols>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::Scalar &res)
+  {
+    etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, innerDim, res);
+  }
+};
+
+/*******************
+*** Packet path  ***
+*******************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+    for(Index i = 1; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    for(Index i = 1; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_EVALUATORS_H