Move evaluation related flags from traits to evaluator and fix evaluators of MapBase and Replicate

author: Gael Guennebaud <g.gael@free.fr> 2014-03-12 13:34:11 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2014-03-12 13:34:11 +0100
commit: 8dd3b716e39d4b4b472b948de1af20838bf17493 (patch)
tree: 3fa4e90f1a6caf23e5028c8c5025e04ad27a8768 /Eigen/src/Core
parent: 7eefdb948c1ff372f85991ff3f9d998e66a554d9 (diff)
23 files changed, 433 insertions, 125 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 2ea1cc126..05816094c 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -28,11 +28,10 @@ template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
 struct copy_using_evaluator_traits
 {
   typedef typename DstEvaluator::XprType Dst;
-  typedef typename SrcEvaluator::XprType Src;
-  // TODO, we should get these flags from the evaluators
+  
   enum {
-    DstFlags = Dst::Flags,
-    SrcFlags = Src::Flags
+    DstFlags = DstEvaluator::Flags,
+    SrcFlags = SrcEvaluator::Flags
   };
   
 public:
@@ -56,7 +55,9 @@ private:
   };
 
   enum {
-    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+    DstIsRowMajor = DstEvaluator::Flags&RowMajorBit,
+    SrcIsRowMajor = SrcEvaluator::Flags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
     MightVectorize = StorageOrdersAgree
                   && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
                   && (functor_traits<AssignFunc>::PacketAccess),
@@ -596,7 +597,7 @@ public:
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1 ? 0
       : int(Traits::ColsAtCompileTime) == 1 ? inner
-      : int(Traits::Flags)&RowMajorBit ? outer
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
       : inner;
   }
 
@@ -605,7 +606,7 @@ public:
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::ColsAtCompileTime) == 1 ? 0
       : int(Traits::RowsAtCompileTime) == 1 ? inner
-      : int(Traits::Flags)&RowMajorBit ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
       : outer;
   }
   
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 31cd5c72c..d92797a98 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -68,6 +68,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     MaxColsAtCompileTime = BlockCols==0 ? 0
                          : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
                          : int(traits<XprType>::MaxColsAtCompileTime),
+
     XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
     IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@@ -80,6 +81,10 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                              ? int(outer_stride_at_compile_time<XprType>::ret)
                              : int(inner_stride_at_compile_time<XprType>::ret),
+    // IsAligned is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    IsAligned = 0,
+#ifndef EIGEN_TEST_EVALUATORS
     MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                        && (InnerStrideAtCompileTime == 1)
                         ? PacketAccessBit : 0,
@@ -92,6 +97,12 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
                                         MaskPacketAccessBit |
                                         MaskAlignedBit),
     Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
+#else
+    // FIXME, this traits is rather specialized for dense object...
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions
+#endif
   };
 };
 
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 33c89c2d4..a5de3593c 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -136,7 +136,9 @@ struct evaluator<PlainObjectBase<Derived> >
     RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
     ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
     
-    CoeffReadCost = NumTraits<Scalar>::ReadCost
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = compute_matrix_evaluator_flags< Scalar,Derived::RowsAtCompileTime,Derived::ColsAtCompileTime,
+                                            Derived::Options,Derived::MaxRowsAtCompileTime,Derived::MaxColsAtCompileTime>::ret
   };
   
   evaluator()
@@ -323,7 +325,8 @@ struct evaluator<Transpose<ArgType> >
   typedef Transpose<ArgType> XprType;
   
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit
   };
 
   evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
@@ -389,9 +392,16 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
   : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
 {
   typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
   
   enum {
-    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit) // FIXME EvalBeforeNestingBit should be needed anymore
   };
 
   evaluator(const XprType& n) 
@@ -437,7 +447,11 @@ struct evaluator<CwiseUnaryOp<UnaryOp, ArgType> >
   typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
   
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = evaluator<ArgType>::Flags & (
+              HereditaryBits | LinearAccessBit | AlignedBit
+            | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
   };
 
   evaluator(const XprType& op) 
@@ -485,7 +499,22 @@ struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
   
   enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( AlignedBit
+           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
   };
 
   evaluator(const XprType& xpr) 
@@ -537,7 +566,9 @@ struct evaluator<CwiseUnaryView<UnaryOp, ArgType> >
   typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
   
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit))
   };
 
   evaluator(const XprType& op) 
@@ -576,12 +607,15 @@ protected:
 
 // -------------------- Map --------------------
 
-template<typename Derived, int AccessorsType>
-struct evaluator<MapBase<Derived, AccessorsType> >
-  : evaluator_base<Derived>
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
+
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
 {
-  typedef MapBase<Derived, AccessorsType> MapType;
-  typedef Derived XprType;
+  typedef Derived  XprType;
   typedef typename XprType::PointerType PointerType;
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -590,81 +624,103 @@ struct evaluator<MapBase<Derived, AccessorsType> >
   typedef typename XprType::PacketReturnType PacketReturnType;
   
   enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    IsRowMajor = XprType::RowsAtCompileTime,
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
   
-  evaluator(const XprType& map) 
+  mapbase_evaluator(const XprType& map) 
     : m_data(const_cast<PointerType>(map.data())),  
-      m_rowStride(map.rowStride()),
-      m_colStride(map.colStride())
-  { }
+      m_xpr(map)
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+  }
  
   CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
   CoeffReturnType coeff(Index index) const 
-  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index,
-                 RowsAtCompileTime == 1 ? index : 0);
+  {
+    return m_data[index * m_xpr.innerStride()];
   }
 
   Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
   Scalar& coeffRef(Index index) 
-  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index,
-                    RowsAtCompileTime == 1 ? index : 0);
+  {
+    return m_data[index * m_xpr.innerStride()];
   }
  
   template<int LoadMode> 
   PacketReturnType packet(Index row, Index col) const 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
     return internal::ploadt<PacketScalar, LoadMode>(ptr);
   }
 
   template<int LoadMode> 
   PacketReturnType packet(Index index) const 
-  { 
-    return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-                            RowsAtCompileTime == 1 ? index : 0);
+  {
+    return internal::ploadt<PacketScalar, LoadMode>(m_data + index * m_xpr.innerStride());
   }
   
   template<int StoreMode> 
   void writePacket(Index row, Index col, const PacketScalar& x) 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
     return internal::pstoret<Scalar, PacketScalar, StoreMode>(ptr, x);
   }
   
   template<int StoreMode> 
   void writePacket(Index index, const PacketScalar& x) 
-  { 
-    return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-                                  RowsAtCompileTime == 1 ? index : 0,
-                                  x);
+  {
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(m_data + index * m_xpr.innerStride(), x);
   }
  
 protected:
   PointerType m_data;
-  int m_rowStride;
-  int m_colStride;
+  const XprType& m_xpr;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType> 
 struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
-  : public evaluator<MapBase<Map<PlainObjectType, MapOptions, StrideType> > >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
 {
   typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    KeepsPacketAccess = bool(HasNoInnerStride)
+                        && ( bool(IsDynamicSize)
+                           || HasNoOuterStride
+                           || ( OuterStrideAtCompileTime!=Dynamic
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+    Flags0 = evaluator<PlainObjectType>::Flags,
+    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
+    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
+    Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
+  };
 
   evaluator(const XprType& map) 
-    : evaluator<MapBase<XprType> >(map) 
+    : mapbase_evaluator<XprType, PlainObjectType>(map) 
   { }
 };
 
@@ -672,12 +728,16 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
 
 template<typename PlainObjectType, int RefOptions, typename StrideType> 
 struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
-  : public evaluator<MapBase<Ref<PlainObjectType, RefOptions, StrideType> > >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
 {
   typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+  
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags
+  };
 
-  evaluator(const XprType& map) 
-    : evaluator<MapBase<XprType> >(map) 
+  evaluator(const XprType& ref) 
+    : mapbase_evaluator<XprType, PlainObjectType>(ref) 
   { }
 };
 
@@ -691,8 +751,39 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
   : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar; 
+  
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    RowsAtCompileTime = traits<ArgType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<ArgType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<ArgType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<ArgType>::MaxColsAtCompileTime,
+    
+    XprTypeIsRowMajor = (int(traits<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : int(outer_stride_at_compile_time<XprType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(outer_stride_at_compile_time<XprType>::ret)
+                             : int(inner_stride_at_compile_time<XprType>::ret),
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                        DirectAccessBit |
+                                        MaskPacketAccessBit |
+                                        MaskAlignedBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
   evaluator(const XprType& block) : block_evaluator_type(block) {}
@@ -778,18 +869,23 @@ protected:
 
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
-  : evaluator<MapBase<Block<ArgType, BlockRows, BlockCols, InnerPanel> > >
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
   block_evaluator(const XprType& block) 
-    : evaluator<MapBase<XprType> >(block) 
-  { }
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+  {
+    // FIXME this should be an internal assertion
+    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % 16) == 0) && "data is not aligned");
+  }
 };
 
 
 // -------------------- Select --------------------
 
+// TODO enable vectorization for Select
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
 struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
   : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
@@ -798,7 +894,9 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
   enum {
     CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost
                   + EIGEN_SIZE_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
-                                   evaluator<ElseMatrixType>::CoeffReadCost)
+                                   evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits
   };
 
   evaluator(const XprType& select) 
@@ -850,7 +948,9 @@ struct evaluator<Replicate<ArgType, RowFactor, ColFactor> >
   typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
   
   enum {
-    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit)
   };
 
   evaluator(const XprType& replicate) 
@@ -858,7 +958,7 @@ struct evaluator<Replicate<ArgType, RowFactor, ColFactor> >
       m_argImpl(m_arg),
       m_rows(replicate.nestedExpression().rows()),
       m_cols(replicate.nestedExpression().cols())
-  { }
+  {}
  
   CoeffReturnType coeff(Index row, Index col) const
   {
@@ -907,17 +1007,19 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
   typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
   typedef typename XprType::Scalar InputScalar;
   enum {
-    TraversalSize = Direction==Vertical ? XprType::RowsAtCompileTime :  XprType::ColsAtCompileTime
+    TraversalSize = Direction==Vertical ? ArgType::RowsAtCompileTime :  XprType::ColsAtCompileTime
   };
   typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
   enum {
     CoeffReadCost = TraversalSize==Dynamic ? Dynamic
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value)
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits)
   };
 
   evaluator(const XprType expr)
     : m_expr(expr)
-  { }
+  {}
 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -948,7 +1050,8 @@ struct evaluator_wrapper_base
 {
   typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags
   };
 
   evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
@@ -1058,7 +1161,15 @@ struct evaluator<Reverse<ArgType, Direction> >
                     || ((Direction == Vertical)   && IsColMajor)
                     || ((Direction == Horizontal) && IsRowMajor),
                     
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess)
   };
   typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
 
@@ -1071,7 +1182,7 @@ struct evaluator<Reverse<ArgType, Direction> >
   CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
-			   ReverseCol ? m_cols.value() - col - 1 : col);
+                           ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
   CoeffReturnType coeff(Index index) const
@@ -1082,7 +1193,7 @@ struct evaluator<Reverse<ArgType, Direction> >
   Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
-			      ReverseCol ? m_cols.value() - col - 1 : col);
+                              ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
   Scalar& coeffRef(Index index)
@@ -1138,7 +1249,9 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
   typedef Diagonal<ArgType, DiagIndex> XprType;
   
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit
   };
 
   evaluator(const XprType& diagonal) 
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 105e7fb11..07861dbc9 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -65,6 +65,7 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   typedef typename remove_reference<LhsNested>::type _LhsNested;
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   enum {
+#ifndef EIGEN_TEST_EVALUATORS
     LhsFlags = _LhsNested::Flags,
     RhsFlags = _RhsNested::Flags,
     SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
@@ -78,12 +79,13 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
            )
         )
      ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
-#ifndef EIGEN_TEST_EVALUATORS
-    ,
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    
     LhsCoeffReadCost = _LhsNested::CoeffReadCost,
     RhsCoeffReadCost = _RhsNested::CoeffReadCost,
     CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
+#else
+    Flags = _LhsNested::Flags & RowMajorBit
 #endif
   };
 };
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 560e03f12..f9f127cc2 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -35,14 +35,15 @@ template<typename NullaryOp, typename PlainObjectType>
 struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
 {
   enum {
+#ifndef EIGEN_TEST_EVALUATORS  
     Flags = (traits<PlainObjectType>::Flags
       & (  HereditaryBits
          | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
          | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
-      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit)
-#ifndef EIGEN_TEST_EVALUATORS  
-    ,
+      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
     CoeffReadCost = functor_traits<NullaryOp>::Cost
+#else
+    Flags = traits<PlainObjectType>::Flags & RowMajorBit
 #endif
   };
 };
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 25da52ab7..af05a9108 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -44,12 +44,13 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum {
+#ifndef EIGEN_TEST_EVALUATORS
     Flags = _XprTypeNested::Flags & (
       HereditaryBits | LinearAccessBit | AlignedBit
-      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
-#ifndef EIGEN_TEST_EVALUATORS
-    ,
+      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
     CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
+#else
+    Flags = _XprTypeNested::Flags & RowMajorBit 
 #endif
   };
 };
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index a0bd80fb9..9cdebb8e7 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -37,9 +37,11 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
-    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
 #ifndef EIGEN_TEST_EVALUATORS
+    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
     CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+#else
+    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | LvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
 #endif
     MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
     // need to cast the sizeof's from size_t to int explicitly, otherwise:
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 02ab04980..3ff6a3e66 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -51,10 +51,13 @@ struct traits<Diagonal<MatrixType,DiagIndex> >
                          : (EIGEN_PLAIN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
                                                  MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
     MaxColsAtCompileTime = 1,
+#ifndef EIGEN_TEST_EVALUATORS
     MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
     Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
-#ifndef EIGEN_TEST_EVALUATORS
     CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+#else
+    MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
 #endif
     MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
     InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 784e4b1ce..ba0042ba4 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -275,6 +275,7 @@ struct traits<DiagonalWrapper<_DiagonalVectorType> >
   typedef typename DiagonalVectorType::Scalar Scalar;
   typedef typename DiagonalVectorType::Index Index;
   typedef typename DiagonalVectorType::StorageKind StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
     ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index 840b70dbb..c6dafdddc 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -26,6 +26,7 @@ struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
 
+#ifndef EIGEN_TEST_EVALUATORS
     _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
     _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
@@ -34,11 +35,10 @@ struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
     //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
     _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
     _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
-
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit //(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
-#ifndef EIGEN_TEST_EVALUATORS
-    ,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit, //(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
     CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
+#else
+    Flags = RowMajorBit & (unsigned int)(MatrixType::Flags)
 #endif
   };
 };
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 8ea13cfb7..23bbb46bf 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -79,10 +79,11 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
     OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
                              ? int(PlainObjectType::OuterStrideAtCompileTime)
                              : int(StrideType::OuterStrideAtCompileTime),
+    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+#ifndef EIGEN_TEST_EVALUATORS
     HasNoInnerStride = InnerStrideAtCompileTime == 1,
     HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
     HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
     IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
     KeepsPacketAccess = bool(HasNoInnerStride)
                         && ( bool(IsDynamicSize)
@@ -95,6 +96,10 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
            ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
     Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
     Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
+#else
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+#endif
   };
 private:
   enum { Options }; // Expressions don't have Options
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index ffa1371c2..de1424b09 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -161,11 +161,16 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     EIGEN_DEVICE_FUNC
     void checkSanity() const
     {
+#ifndef EIGEN_TEST_EVALUATORS
+      // moved to evaluator
       EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
                                         internal::inner_stride_at_compile_time<Derived>::ret==1),
                           PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
-                   && "data is not aligned");
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned");
+#else
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned");
+#endif
+      
     }
 
     PointerType m_data;
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index cac90bc1f..453180049 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -33,14 +33,29 @@ template<typename Lhs, typename Rhs, int Option, typename StorageKind> class Pro
 namespace internal {
 template<typename Lhs, typename Rhs, int Option>
 struct traits<Product<Lhs, Rhs, Option> >
-  : traits<CoeffBasedProduct<Lhs, Rhs, NestByRefBit> >
-{ 
-  // We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
-  // TODO: This flag should eventually go in a separate evaluator traits class
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  
+  typedef MatrixXpr XprKind;
+  
+  typedef typename scalar_product_traits<typename LhsCleaned::Scalar, typename RhsCleaned::Scalar>::ReturnType Scalar;
+  typedef typename promote_storage_type<typename traits<LhsCleaned>::StorageKind,
+                                           typename traits<RhsCleaned>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<LhsCleaned>::Index,
+                                         typename traits<RhsCleaned>::Index>::type Index;
+  
   enum {
-    Flags = traits<CoeffBasedProduct<Lhs, Rhs, NestByRefBit> >::Flags & ~(EvalBeforeNestingBit | DirectAccessBit)
+    RowsAtCompileTime    = LhsCleaned::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsCleaned::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsCleaned::MaxColsAtCompileTime,
+    
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
   };
 };
+
 } // end namespace internal
 
 
@@ -59,8 +74,6 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
         typename internal::promote_storage_type<typename Lhs::StorageKind,
                                                 typename Rhs::StorageKind>::ret>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
-    
-    
 
     typedef typename internal::nested<Lhs>::type LhsNested;
     typedef typename internal::nested<Rhs>::type RhsNested;
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 7ebf31696..1159c2f44 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -18,19 +18,6 @@ namespace Eigen {
 namespace internal {
 
 /** \internal
-  * \class product_evaluator
-  * Products need their own evaluator with more template arguments allowing for
-  * easier partial template specializations.
-  */
-template< typename T,
-          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
-          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
-          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
-          typename LhsScalar = typename T::Lhs::Scalar,
-          typename RhsScalar = typename T::Rhs::Scalar
-        > struct product_evaluator;
-
-/** \internal
   * Evaluator of a product expression.
   * Since products require special treatments to handle all possible cases,
   * we simply deffer the evaluation logic to a product_evaluator class
@@ -119,6 +106,18 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, DenseSha
     : m_result(xpr.rows(), xpr.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
+    
+// FIXME shall we handle nested_eval here?
+//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+//     
+//     const LhsNested lhs(xpr.lhs());
+//     const RhsNested rhs(xpr.rhs());
+//   
+//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
     generic_product_impl<Lhs, Rhs>::evalTo(m_result, xpr.lhs(), xpr.rhs());
   }
   
@@ -133,6 +132,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
   {
+    // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
   }
 };
@@ -144,6 +144,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
   {
+    // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
   }
 };
@@ -155,6 +156,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_ass
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
   {
+    // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
   }
 };
@@ -368,7 +370,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
 {
   typedef Product<Lhs, Rhs, LazyProduct> XprType;
-  typedef CoeffBasedProduct<Lhs, Rhs, 0> CoeffBasedProductType;
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -396,9 +397,13 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   typedef typename evaluator<RhsNestedCleaned>::type RhsEtorType;
   
   enum {
-    RowsAtCompileTime = traits<CoeffBasedProductType>::RowsAtCompileTime,
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
+      
     PacketSize = packet_traits<Scalar>::size,
-    InnerSize  = traits<CoeffBasedProductType>::InnerSize,
     
     LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
     RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
@@ -407,8 +412,51 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                     + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
 
     Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    CanVectorizeInner = traits<CoeffBasedProductType>::CanVectorizeInner,
-    Flags = traits<CoeffBasedProductType>::Flags
+    
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+    
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+      
+    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+                    && (ColsAtCompileTime == Dynamic
+                        || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
+                            && (RhsFlags&AlignedBit)
+                            )
+                        ),
+
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+                    && (RowsAtCompileTime == Dynamic
+                        || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
+                            && (LhsFlags&AlignedBit)
+                            )
+                        ),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                    : (RhsRowMajor && !CanVectorizeLhs),
+
+    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+          | (EvalToRowMajor ? RowMajorBit : 0)
+          | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
+          | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
+          // TODO enable vectorization for mixed types
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
+          
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+    * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+    */
+    CanVectorizeInner =    SameType
+                        && LhsRowMajor
+                        && (!RhsRowMajor)
+                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+                        && (LhsFlags & RhsFlags & AlignedBit)
+                        && (InnerSize % packet_traits<Scalar>::size == 0)
   };
   
   const CoeffReturnType coeff(Index row, Index col) const
@@ -689,7 +737,7 @@ protected:
 * Diagonal products
 ***************************************************************************/
   
-template<typename MatrixType, typename DiagonalType, typename Derived>
+template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
 struct diagonal_product_evaluator_base
   : evaluator_base<Derived>
 {
@@ -698,7 +746,20 @@ struct diagonal_product_evaluator_base
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
 public:
   enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost
+    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit
+            //(int(MatrixFlags)&int(DiagFlags)&AlignedBit),
   };
   
   diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -724,7 +785,7 @@ protected:
   {
     enum {
       InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-      DiagonalPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
+      DiagonalPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagFlags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
     };
     return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
                           m_diagImpl.template packet<DiagonalPacketLoadMode>(id));
@@ -737,9 +798,9 @@ protected:
 // diagonal * dense
 template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar> 
-  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct> >
+  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
 {
-  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct> > Base;
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
   using Base::m_diagImpl;
   using Base::m_matImpl;
   using Base::coeff;
@@ -783,9 +844,9 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
 // dense * diagonal
 template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape, typename Lhs::Scalar, typename Rhs::Scalar> 
-  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct> >
+  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
 {
-  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct> > Base;
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
   using Base::m_diagImpl;
   using Base::m_matImpl;
   using Base::coeff;
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 41290323f..6c8c58e95 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -389,8 +389,19 @@ DenseBase<Derived>::redux(const Func& func) const
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 #ifdef EIGEN_TEST_EVALUATORS
   
+  // FIXME, eval_nest should be handled by redux_evaluator, however:
+  //  - it is currently difficult to provide the right Flags since they are still handled by the expressions
+  //  - handling it here might reduce the number of template instantiations
+//   typedef typename internal::nested_eval<Derived,1>::type ThisNested;
+//   typedef typename internal::remove_all<ThisNested>::type ThisNestedCleaned;
+//   typedef typename internal::redux_evaluator<ThisNestedCleaned> ThisEvaluator;
+//   
+//   ThisNested thisNested(derived());
+//   ThisEvaluator thisEval(thisNested);
+  
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
+  
   return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
   
 #else
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 1e640d8aa..2dff03ea3 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -53,10 +53,13 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0)
+    
 #ifndef EIGEN_TEST_EVALUATORS
-    ,
+    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
     CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+#else
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
 #endif
   };
 };
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 495b44cc4..4969bb4fc 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -45,14 +45,15 @@ struct traits<Reverse<MatrixType, Direction> >
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
 
+#ifndef EIGEN_TEST_EVALUATORS
     // let's enable LinearAccess only with vectorization because of the product overhead
     LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
                  ? LinearAccessBit : 0,
 
-    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess)
-#ifndef EIGEN_TEST_EVALUATORS
-    ,
+    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
     CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+#else
+    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
 #endif
   };
 };
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index abcba2d15..d4fd88e62 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -43,12 +43,13 @@ struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
     ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits
 #ifndef EIGEN_TEST_EVALUATORS
-    ,
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
     CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
                   + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
                                    traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
+#else
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
 #endif
   };
 };
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 98f9e888f..11b0e45a8 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -41,12 +41,17 @@ struct traits<Transpose<MatrixType> > : traits<MatrixType>
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+#ifndef EIGEN_TEST_EVALUATORS
     FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
     Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
     Flags1 = Flags0 | FlagsLvalueBit,
     Flags = Flags1 ^ RowMajorBit,
-#ifndef EIGEN_TEST_EVALUATORS
     CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
+#else
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags1 = Flags0 | FlagsLvalueBit,
+    Flags = Flags1 ^ RowMajorBit,
 #endif
     InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
     OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 702d0006d..672b9662f 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -48,8 +48,12 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
     ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
+#ifndef EIGEN_TEST_EVALUATORS
     Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
     Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
+#else
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
+#endif
     TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
   };
 #ifndef EIGEN_TEST_EVALUATORS
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index eed7f4258..771613b11 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -259,7 +259,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  
+    
     typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 975fdbf2a..092ba758e 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -157,6 +157,18 @@ template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynami
 
 namespace internal {
 template<typename Lhs, typename Rhs> struct product_type;
+/** \internal
+  * \class product_evaluator
+  * Products need their own evaluator with more template arguments allowing for
+  * easier partial template specializations.
+  */
+template< typename T,
+          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename T::Lhs::Scalar,
+          typename RhsScalar = typename T::Rhs::Scalar
+        > struct product_evaluator;
 }
 
 template<typename Lhs, typename Rhs,
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index bcd6183e2..016b37f71 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -124,6 +124,7 @@ template<typename _Scalar, int _Rows, int _Cols,
     typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type;
 };
 
+#ifndef EIGEN_TEST_EVALUATORS
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 class compute_matrix_flags
 {
@@ -158,6 +159,57 @@ class compute_matrix_flags
     enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
 };
 
+#else // EIGEN_TEST_EVALUATORS
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class compute_matrix_flags
+{
+    enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 };
+  public:
+    // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+    // and then propagate this information to the evaluator's flags.
+    // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
+};
+#endif
+
+#ifdef EIGEN_ENABLE_EVALUATORS
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class compute_matrix_evaluator_flags
+{
+    enum {
+      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
+
+      aligned_bit =
+      (
+            ((Options&DontAlign)==0)
+        && (
+#if EIGEN_ALIGN_STATICALLY
+             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+#else
+             0
+#endif
+
+          ||
+
+#if EIGEN_ALIGN
+             is_dynamic_size_storage
+#else
+             0
+#endif
+
+          )
+      ) ? AlignedBit : 0,
+      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
+    };
+
+  public:
+    enum { ret = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit };
+};
+
+#endif // EIGEN_ENABLE_EVALUATORS
+
 template<int _Rows, int _Cols> struct size_at_compile_time
 {
   enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
author	Gael Guennebaud <g.gael@free.fr>	2014-03-12 13:34:11 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2014-03-12 13:34:11 +0100
commit	8dd3b716e39d4b4b472b948de1af20838bf17493 (patch)
tree	3fa4e90f1a6caf23e5028c8c5025e04ad27a8768 /Eigen/src/Core
parent	7eefdb948c1ff372f85991ff3f9d998e66a554d9 (diff)