1 files changed, 549 insertions, 329 deletions
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 3568cb85f..1c7123b85 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -14,57 +14,85 @@
 #define EIGEN_COREEVALUATORS_H
 
 namespace Eigen {
-
+  
 namespace internal {
 
-// evaluator_traits<T> contains traits for evaluator_impl<T> 
+// Maps an expression's storage kind to the evaluator kind used to dispatch evaluators.
+// The primary template defaults to IndexBased, i.e. index-based accessors.
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
 
-template<typename T>
-struct evaluator_traits
-{
-  // 1 if evaluator_impl<T>::evalTo() exists
-  // 0 if evaluator_impl<T> allows coefficient-based access
-  static const int HasEvalTo = 0;
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template<typename StorageKind> struct storage_kind_to_shape;
 
-  // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
-  // temporary; 0 if not.
-  static const int AssumeAliasing = 0;
-};
 
-// expression class for evaluating nested expression to a temporary
- 
-template<typename ArgType>
-class EvalToTemp;
+template<> struct storage_kind_to_shape<Dense> { typedef DenseShape Shape; };
 
-// evaluator<T>::type is type of evaluator for T
-// evaluator<T>::nestedType is type of evaluator if T is nested inside another evaluator
- 
-template<typename T>
-struct evaluator_impl 
-{ };
- 
-template<typename T, int Nested = evaluator_traits<T>::HasEvalTo>
-struct evaluator_nested_type;
+
+// FIXME Is this necessary? And why was it not before refactoring???
+template<> struct storage_kind_to_shape<PermutationStorage> { typedef PermutationShape Shape; };
+
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specializations of evaluator that provide additional template arguments for each kind of evaluator.
+// We currently distinguish the following kinds of evaluators:
+// - unary_evaluator    for expressions taking only one argument (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expressions taking two arguments (CwiseBinaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template< typename T,
+          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct binary_evaluator;
+
+template< typename T,
+          typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar> struct unary_evaluator;
+          
+// evaluator_traits<T> contains traits for evaluator<T> 
 
 template<typename T>
-struct evaluator_nested_type<T, 0>
+struct evaluator_traits_base
 {
-  typedef evaluator_impl<T> type;
+  // TODO check whether these two indirections are really needed.
+  // Basically, if nobody overrides type and nestedType, then they can be dropped.
+//   typedef evaluator<T> type;
+//   typedef evaluator<T> nestedType;
+  
+  // by default, get evaluator kind and shape from storage
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+  
+  // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
+  // temporary; 0 if not.
+  static const int AssumeAliasing = 0;
 };
 
+// Default evaluator traits
 template<typename T>
-struct evaluator_nested_type<T, 1>
+struct evaluator_traits : public evaluator_traits_base<T>
 {
-  typedef evaluator_impl<EvalToTemp<T> > type;
 };
 
+
+// By default, we assume a unary expression:
 template<typename T>
-struct evaluator
+struct evaluator : public unary_evaluator<T>
 {
-  typedef evaluator_impl<T> type;
-  typedef typename evaluator_nested_type<T>::type nestedType;
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
 };
 
+
 // TODO: Think about const-correctness
 
 template<typename T>
@@ -76,47 +104,59 @@ struct evaluator<const T>
 
 // TODO this class does not seem to be necessary anymore
 template<typename ExpressionType>
-struct evaluator_impl_base
+struct evaluator_base
 {
-  typedef typename ExpressionType::Index Index;
+//   typedef typename evaluator_traits<ExpressionType>::type type;
+//   typedef typename evaluator_traits<ExpressionType>::nestedType nestedType;
+  typedef evaluator<ExpressionType> type;
+  typedef evaluator<ExpressionType> nestedType;
+  
+  typedef typename traits<ExpressionType>::Index Index;
   // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
   typedef traits<ExpressionType> ExpressionTraits;
-
-  evaluator_impl<ExpressionType>& derived() 
-  {
-    return *static_cast<evaluator_impl<ExpressionType>*>(this); 
-  }
 };
 
 // -------------------- Matrix and Array --------------------
 //
-// evaluator_impl<PlainObjectBase> is a common base class for the
+// evaluator<PlainObjectBase> is a common base class for the
 // Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
 
 template<typename Derived>
-struct evaluator_impl<PlainObjectBase<Derived> >
-  : evaluator_impl_base<Derived>
+struct evaluator<PlainObjectBase<Derived> >
+  : evaluator_base<Derived>
 {
   typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Index Index;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+  typedef typename PlainObjectType::PacketScalar PacketScalar;
+  typedef typename PlainObjectType::PacketReturnType PacketReturnType;
 
   enum {
     IsRowMajor = PlainObjectType::IsRowMajor,
     IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
     RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
-    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+    
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = compute_matrix_evaluator_flags< Scalar,Derived::RowsAtCompileTime,Derived::ColsAtCompileTime,
+                                            Derived::Options,Derived::MaxRowsAtCompileTime,Derived::MaxColsAtCompileTime>::ret
   };
-
-  evaluator_impl(const PlainObjectType& m) 
+  
+  EIGEN_DEVICE_FUNC evaluator()
+    : m_data(0),
+      m_outerStride(IsVectorAtCompileTime  ? 0 
+                                           : int(IsRowMajor) ? ColsAtCompileTime 
+                                           : RowsAtCompileTime)
+  {}
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
     : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
   { }
 
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
-  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
-  typedef typename PlainObjectType::PacketScalar PacketScalar;
-  typedef typename PlainObjectType::PacketReturnType PacketReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     if (IsRowMajor)
       return m_data[row * m_outerStride.value() + col];
@@ -124,12 +164,12 @@ struct evaluator_impl<PlainObjectBase<Derived> >
       return m_data[row + col * m_outerStride.value()];
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_data[index];
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     if (IsRowMajor)
       return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
@@ -137,7 +177,7 @@ struct evaluator_impl<PlainObjectBase<Derived> >
       return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return const_cast<Scalar*>(m_data)[index];
   }
@@ -184,153 +224,45 @@ protected:
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator_impl<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-  : evaluator_impl<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+  
+  EIGEN_DEVICE_FUNC evaluator() {} // default ctor, annotated like the base evaluator's default ctor
 
-  evaluator_impl(const XprType& m) 
-    : evaluator_impl<PlainObjectBase<XprType> >(m) 
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
   { }
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator_impl<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-  : evaluator_impl<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  evaluator_impl(const XprType& m) 
-    : evaluator_impl<PlainObjectBase<XprType> >(m) 
-  { }
-};
-
-// -------------------- EvalToTemp --------------------
-
-template<typename ArgType>
-struct traits<EvalToTemp<ArgType> >
-  : public traits<ArgType>
-{ };
-
-template<typename ArgType>
-class EvalToTemp
-  : public dense_xpr_base<EvalToTemp<ArgType> >::type
-{
- public:
- 
-  typedef typename dense_xpr_base<EvalToTemp>::type Base;
-  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
- 
-  EvalToTemp(const ArgType& arg)
-    : m_arg(arg)
-  { }
- 
-  const ArgType& arg() const
-  {
-    return m_arg;
-  }
-
-  Index rows() const 
-  {
-    return m_arg.rows();
-  }
-
-  Index cols() const 
-  {
-    return m_arg.cols();
-  }
-
- private:
-  const ArgType& m_arg;
-};
- 
-template<typename ArgType>
-struct evaluator_impl<EvalToTemp<ArgType> >
-{
-  typedef EvalToTemp<ArgType> XprType;
-  typedef typename ArgType::PlainObject PlainObject;
-
-  evaluator_impl(const XprType& xpr) 
-    : m_result(xpr.rows(), xpr.cols()), m_resultImpl(m_result)
-  {
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, xpr.arg());
-  }
-
-  // This constructor is used when nesting an EvalTo evaluator in another evaluator
-  evaluator_impl(const ArgType& arg) 
-    : m_result(arg.rows(), arg.cols()), m_resultImpl(m_result)
-  {
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, arg);
-  }
-
-  typedef typename PlainObject::Index Index;
-  typedef typename PlainObject::Scalar Scalar;
-  typedef typename PlainObject::CoeffReturnType CoeffReturnType;
-  typedef typename PlainObject::PacketScalar PacketScalar;
-  typedef typename PlainObject::PacketReturnType PacketReturnType;
-
-  // All other functions are forwarded to m_resultImpl
-
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_resultImpl.coeff(row, col); 
-  }
-  
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return m_resultImpl.coeff(index); 
-  }
+  EIGEN_DEVICE_FUNC evaluator() {} // default ctor, annotated like the base evaluator's default ctor
   
-  Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_resultImpl.coeffRef(row, col); 
-  }
-  
-  Scalar& coeffRef(Index index) 
-  { 
-    return m_resultImpl.coeffRef(index); 
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
-  {
-    return m_resultImpl.template packet<LoadMode>(row, col);
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
-  {
-    return m_resultImpl.packet<LoadMode>(index);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
-  {
-    m_resultImpl.template writePacket<StoreMode>(row, col, x);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
-  {
-    m_resultImpl.template writePacket<StoreMode>(index, x);
-  }
-
-protected:
-  PlainObject m_result;
-  typename evaluator<PlainObject>::nestedType m_resultImpl;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
+  { }
 };
 
 // -------------------- Transpose --------------------
 
 template<typename ArgType>
-struct evaluator_impl<Transpose<ArgType> >
-  : evaluator_impl_base<Transpose<ArgType> >
+struct unary_evaluator<Transpose<ArgType>, IndexBased>
+  : evaluator_base<Transpose<ArgType> >
 {
   typedef Transpose<ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit
+  };
 
-  evaluator_impl(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -338,22 +270,22 @@ struct evaluator_impl<Transpose<ArgType> >
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(col, row);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(col, row);
   }
 
-  typename XprType::Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index);
   }
@@ -387,13 +319,27 @@ protected:
 };
 
 // -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is no need for more sophisticated dispatching here.
 
 template<typename NullaryOp, typename PlainObjectType>
-struct evaluator_impl<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
 {
   typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
+  
+  enum {
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit) // FIXME EvalBeforeNestingBit should not be needed anymore
+  };
 
-  evaluator_impl(const XprType& n) 
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
     : m_functor(n.functor()) 
   { }
 
@@ -401,12 +347,12 @@ struct evaluator_impl<CwiseNullaryOp<NullaryOp,PlainObjectType> >
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(index);
   }
@@ -430,11 +376,20 @@ protected:
 // -------------------- CwiseUnaryOp --------------------
 
 template<typename UnaryOp, typename ArgType>
-struct evaluator_impl<CwiseUnaryOp<UnaryOp, ArgType> >
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
+  : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = evaluator<ArgType>::Flags & (
+              HereditaryBits | LinearAccessBit | AlignedBit
+            | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
+  };
 
-  evaluator_impl(const XprType& op) 
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
     : m_functor(op.functor()), 
       m_argImpl(op.nestedExpression()) 
   { }
@@ -443,12 +398,12 @@ struct evaluator_impl<CwiseUnaryOp<UnaryOp, ArgType> >
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(m_argImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_argImpl.coeff(index));
   }
@@ -472,12 +427,43 @@ protected:
 
 // -------------------- CwiseBinaryOp --------------------
 
+// this is a binary expression
 template<typename BinaryOp, typename Lhs, typename Rhs>
-struct evaluator_impl<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+  : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
 
-  evaluator_impl(const XprType& xpr) 
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( AlignedBit
+           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
+  };
+
+  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
     : m_functor(xpr.functor()),
       m_lhsImpl(xpr.lhs()), 
       m_rhsImpl(xpr.rhs())  
@@ -487,12 +473,12 @@ struct evaluator_impl<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
   }
@@ -501,14 +487,14 @@ struct evaluator_impl<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   PacketScalar packet(Index row, Index col) const
   {
     return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(row, col),
-			      m_rhsImpl.template packet<LoadMode>(row, col));
+                              m_rhsImpl.template packet<LoadMode>(row, col));
   }
 
   template<int LoadMode>
   PacketScalar packet(Index index) const
   {
     return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(index),
-			      m_rhsImpl.template packet<LoadMode>(index));
+                              m_rhsImpl.template packet<LoadMode>(index));
   }
 
 protected:
@@ -520,12 +506,18 @@ protected:
 // -------------------- CwiseUnaryView --------------------
 
 template<typename UnaryOp, typename ArgType>
-struct evaluator_impl<CwiseUnaryView<UnaryOp, ArgType> >
-  : evaluator_impl_base<CwiseUnaryView<UnaryOp, ArgType> >
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
+  : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit))
+  };
 
-  evaluator_impl(const XprType& op) 
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
     : m_unaryOp(op.functor()), 
       m_argImpl(op.nestedExpression()) 
   { }
@@ -534,22 +526,22 @@ struct evaluator_impl<CwiseUnaryView<UnaryOp, ArgType> >
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_unaryOp(m_argImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_unaryOp(m_argImpl.coeff(index));
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_unaryOp(m_argImpl.coeffRef(row, col));
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_unaryOp(m_argImpl.coeffRef(index));
   }
@@ -561,13 +553,15 @@ protected:
 
 // -------------------- Map --------------------
 
-template<typename Derived, int AccessorsType>
-struct evaluator_impl<MapBase<Derived, AccessorsType> >
-  : evaluator_impl_base<Derived>
-{
-  typedef MapBase<Derived, AccessorsType> MapType;
-  typedef Derived XprType;
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
 
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
+{
+  typedef Derived  XprType;
   typedef typename XprType::PointerType PointerType;
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -575,81 +569,121 @@ struct evaluator_impl<MapBase<Derived, AccessorsType> >
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;
   
-  evaluator_impl(const XprType& map) 
-    : m_data(const_cast<PointerType>(map.data())),  
-      m_rowStride(map.rowStride()),
-      m_colStride(map.colStride())
-  { }
- 
   enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
+    IsRowMajor = XprType::IsRowMajor, // forwards the expression's own IsRowMajor (was wrongly set to its row count)
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
+  
+  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
+    : m_data(const_cast<PointerType>(map.data())),  
+      m_xpr(map)
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+  }
  
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index,
-		 RowsAtCompileTime == 1 ? index : 0);
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index * m_xpr.innerStride()];
   }
 
-  Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
-  Scalar& coeffRef(Index index) 
-  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index,
-		    RowsAtCompileTime == 1 ? index : 0);
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_data[index * m_xpr.innerStride()];
   }
  
   template<int LoadMode> 
   PacketReturnType packet(Index row, Index col) const 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
     return internal::ploadt<PacketScalar, LoadMode>(ptr);
   }
 
   template<int LoadMode> 
   PacketReturnType packet(Index index) const 
-  { 
-    return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-			    RowsAtCompileTime == 1 ? index : 0);
+  {
+    return internal::ploadt<PacketScalar, LoadMode>(m_data + index * m_xpr.innerStride());
   }
   
   template<int StoreMode> 
   void writePacket(Index row, Index col, const PacketScalar& x) 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
     return internal::pstoret<Scalar, PacketScalar, StoreMode>(ptr, x);
   }
   
   template<int StoreMode> 
   void writePacket(Index index, const PacketScalar& x) 
-  { 
-    return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-				  RowsAtCompileTime == 1 ? index : 0,
-				  x);
+  {
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(m_data + index * m_xpr.innerStride(), x);
   }
  
 protected:
   PointerType m_data;
-  int m_rowStride;
-  int m_colStride;
+  const XprType& m_xpr;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType> 
-struct evaluator_impl<Map<PlainObjectType, MapOptions, StrideType> >
-  : public evaluator_impl<MapBase<Map<PlainObjectType, MapOptions, StrideType> > >
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
 {
   typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    KeepsPacketAccess = bool(HasNoInnerStride)
+                        && ( bool(IsDynamicSize)
+                           || HasNoOuterStride
+                           || ( OuterStrideAtCompileTime!=Dynamic
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
+    Flags0 = evaluator<PlainObjectType>::Flags,
+    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
+    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
+    Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
+  };
 
-  evaluator_impl(const XprType& map) 
-    : evaluator_impl<MapBase<XprType> >(map) 
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
+    : mapbase_evaluator<XprType, PlainObjectType>(map) 
+  { }
+};
+
+// -------------------- Ref --------------------
+
+template<typename PlainObjectType, int RefOptions, typename StrideType> 
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
+{
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+  
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref) 
   { }
 };
 
@@ -659,21 +693,68 @@ template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
          bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
          
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
-struct evaluator_impl<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
   : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar; 
+  // Compile-time read cost and flags of the block, derived from evaluator<ArgType> and traits<XprType>.
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    // compile-time dimensions, taken from traits<XprType>
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+    // storage order: MaxRows==1 (row vector) => row-major, MaxCols==1 (column vector) => column-major, otherwise follow ArgType; inner/outer strides are swapped when the order differs from ArgType
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(inner_stride_at_compile_time<ArgType>::ret)
+                             : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(outer_stride_at_compile_time<ArgType>::ret)
+                             : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    // Mask* values select which bits of evaluator<ArgType>::Flags are kept in Flags0; Flags* values are OR-ed into the final Flags
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                           DirectAccessBit |
+                                           MaskPacketAccessBit |
+                                           MaskAlignedBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit
+  };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  evaluator_impl(const XprType& block) : block_evaluator_type(block) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
 };
 
+// Block of an expression without direct access: block_evaluator only forwards to unary_evaluator<Block<...> >
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
-  : evaluator_impl_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+  : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block) 
+  {}
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+  : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  block_evaluator(const XprType& block) 
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
     : m_argImpl(block.nestedExpression()), 
       m_startRow(block.startRow()), 
       m_startCol(block.startCol()) 
@@ -689,26 +770,24 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
     RowsAtCompileTime = XprType::RowsAtCompileTime
   };
  
-  CoeffReturnType coeff(Index row, Index col) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { 
     return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
   }
   
-  CoeffReturnType coeff(Index index) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index,
-		 RowsAtCompileTime == 1 ? index : 0);
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
-  Scalar& coeffRef(Index row, Index col) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   { 
     return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
   }
   
-  Scalar& coeffRef(Index index) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index,
-		    RowsAtCompileTime == 1 ? index : 0);
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
  
   template<int LoadMode> 
@@ -721,7 +800,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
   PacketReturnType packet(Index index) const 
   { 
     return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-			    RowsAtCompileTime == 1 ? index : 0);
+                            RowsAtCompileTime == 1 ? index : 0);
   }
   
   template<int StoreMode> 
@@ -734,8 +813,8 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
   void writePacket(Index index, const PacketScalar& x) 
   { 
     return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-				  RowsAtCompileTime == 1 ? index : 0,
-				  x);
+                                  RowsAtCompileTime == 1 ? index : 0,
+                                  x);
   }
  
 protected:
@@ -749,24 +828,38 @@ protected:
 
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
-  : evaluator_impl<MapBase<Block<ArgType, BlockRows, BlockCols, InnerPanel> > >
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  block_evaluator(const XprType& block) 
-    : evaluator_impl<MapBase<XprType> >(block) 
-  { }
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+  {
+    // FIXME this should be an internal assertion: when evaluator<XprType> advertises AlignedBit, block.data() must be a multiple of EIGEN_ALIGN_BYTES
+    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
+  }
 };
 
 
 // -------------------- Select --------------------
+// TODO shall we introduce a ternary_evaluator?
 
+// TODO enable vectorization for Select
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct evaluator_impl<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+  : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
 {
   typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
+  enum {
+    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost
+                  + EIGEN_SIZE_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
+                                   evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits
+  };
 
-  evaluator_impl(const XprType& select) 
+  inline EIGEN_DEVICE_FUNC  explicit evaluator(const XprType& select)
     : m_conditionImpl(select.conditionMatrix()),
       m_thenImpl(select.thenMatrix()),
       m_elseImpl(select.elseMatrix())
@@ -775,7 +868,7 @@ struct evaluator_impl<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     if (m_conditionImpl.coeff(row, col))
       return m_thenImpl.coeff(row, col);
@@ -783,7 +876,7 @@ struct evaluator_impl<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType
       return m_elseImpl.coeff(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     if (m_conditionImpl.coeff(index))
       return m_thenImpl.coeff(index);
@@ -801,21 +894,33 @@ protected:
 // -------------------- Replicate --------------------
 
 template<typename ArgType, int RowFactor, int ColFactor> 
-struct evaluator_impl<Replicate<ArgType, RowFactor, ColFactor> >
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
+  : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
 {
   typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
-
-  evaluator_impl(const XprType& replicate) 
-    : m_argImpl(replicate.nestedExpression()),
-      m_rows(replicate.nestedExpression().rows()),
-      m_cols(replicate.nestedExpression().cols())
-  { }
- 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
+  };
+  typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit)
+  };
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
+    : m_arg(replicate.nestedExpression()),
+      m_argImpl(m_arg),
+      m_rows(replicate.nestedExpression().rows()),
+      m_cols(replicate.nestedExpression().cols())
+  {}
+ 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     // try to avoid using modulo; this is a pure optimization strategy
     const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
@@ -842,9 +947,10 @@ struct evaluator_impl<Replicate<ArgType, RowFactor, ColFactor> >
   }
  
 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-  const variable_if_dynamic<Index, XprType::RowsAtCompileTime> m_rows;
-  const variable_if_dynamic<Index, XprType::ColsAtCompileTime> m_cols;
+  const ArgTypeNested m_arg; // FIXME is it OK to store both the argument and its evaluator?? (we have the same situation in evaluator_product)
+  typename evaluator<ArgTypeNestedCleaned>::nestedType m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
 };
 
 
@@ -855,23 +961,35 @@ protected:
 //       the row() and col() member functions.
 
 template< typename ArgType, typename MemberOp, int Direction>
-struct evaluator_impl<PartialReduxExpr<ArgType, MemberOp, Direction> >
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
 {
   typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename ArgType::Scalar InputScalar; // scalar type of the coefficients being reduced, i.e. of the nested expression
+  enum { // TraversalSize: number of ArgType coefficients folded into each coefficient of the result
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits)
+  };
 
-  evaluator_impl(const XprType expr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& expr) // taken by const reference like the other evaluators; m_expr is initialized from it
     : m_expr(expr)
-  { }
+  {}
 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
  
-  CoeffReturnType coeff(Index row, Index col) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { 
     return m_expr.coeff(row, col);
   }
   
-  CoeffReturnType coeff(Index index) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   { 
     return m_expr.coeff(index);
   }
@@ -883,16 +1001,20 @@ protected:
 
 // -------------------- MatrixWrapper and ArrayWrapper --------------------
 //
-// evaluator_impl_wrapper_base<T> is a common base class for the
+// evaluator_wrapper_base<T> is a common base class for the
 // MatrixWrapper and ArrayWrapper evaluators.
 
 template<typename XprType>
-struct evaluator_impl_wrapper_base
-  : evaluator_impl_base<XprType>
+struct evaluator_wrapper_base
+  : evaluator_base<XprType>
 {
   typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags
+  };
 
-  evaluator_impl_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
 
   typedef typename ArgType::Index Index;
   typedef typename ArgType::Scalar Scalar;
@@ -900,22 +1022,22 @@ struct evaluator_impl_wrapper_base
   typedef typename ArgType::PacketScalar PacketScalar;
   typedef typename ArgType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(row, col);
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index);
   }
@@ -949,24 +1071,24 @@ protected:
 };
 
 template<typename TArgType>
-struct evaluator_impl<MatrixWrapper<TArgType> >
-  : evaluator_impl_wrapper_base<MatrixWrapper<TArgType> >
+struct unary_evaluator<MatrixWrapper<TArgType> > // all members come from evaluator_wrapper_base
+  : evaluator_wrapper_base<MatrixWrapper<TArgType> >
 {
   typedef MatrixWrapper<TArgType> XprType;
 
-  evaluator_impl(const XprType& wrapper) 
-    : evaluator_impl_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) // the base is built from the wrapped (nested) expression
+    : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
 
 template<typename TArgType>
-struct evaluator_impl<ArrayWrapper<TArgType> >
-  : evaluator_impl_wrapper_base<ArrayWrapper<TArgType> >
+struct unary_evaluator<ArrayWrapper<TArgType> > // all members come from evaluator_wrapper_base
+  : evaluator_wrapper_base<ArrayWrapper<TArgType> >
 {
   typedef ArrayWrapper<TArgType> XprType;
 
-  evaluator_impl(const XprType& wrapper) 
-    : evaluator_impl_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) // the base is built from the wrapped (nested) expression
+    : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
 
@@ -977,8 +1099,8 @@ struct evaluator_impl<ArrayWrapper<TArgType> >
 template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond;
 
 template<typename ArgType, int Direction>
-struct evaluator_impl<Reverse<ArgType, Direction> >
-  : evaluator_impl_base<Reverse<ArgType, Direction> >
+struct unary_evaluator<Reverse<ArgType, Direction> >
+  : evaluator_base<Reverse<ArgType, Direction> >
 {
   typedef Reverse<ArgType, Direction> XprType;
   typedef typename XprType::Index Index;
@@ -997,34 +1119,44 @@ struct evaluator_impl<Reverse<ArgType, Direction> >
     OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
     ReversePacket = (Direction == BothDirections)
                     || ((Direction == Vertical)   && IsColMajor)
-                    || ((Direction == Horizontal) && IsRowMajor)
+                    || ((Direction == Horizontal) && IsRowMajor),
+                    
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess)
   };
   typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
 
-  evaluator_impl(const XprType& reverse) 
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
     : m_argImpl(reverse.nestedExpression()),
       m_rows(ReverseRow ? reverse.nestedExpression().rows() : 0),
       m_cols(ReverseCol ? reverse.nestedExpression().cols() : 0)
   { }
  
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
-			   ReverseCol ? m_cols.value() - col - 1 : col);
+                           ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
-			      ReverseCol ? m_cols.value() - col - 1 : col);
+                              ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
   }
@@ -1071,36 +1203,44 @@ protected:
 // -------------------- Diagonal --------------------
 
 template<typename ArgType, int DiagIndex>
-struct evaluator_impl<Diagonal<ArgType, DiagIndex> >
-  : evaluator_impl_base<Diagonal<ArgType, DiagIndex> >
+struct evaluator<Diagonal<ArgType, DiagIndex> >
+  : evaluator_base<Diagonal<ArgType, DiagIndex> >
 {
   typedef Diagonal<ArgType, DiagIndex> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit
+  };
 
-  evaluator_impl(const XprType& diagonal) 
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
     : m_argImpl(diagonal.nestedExpression()),
       m_index(diagonal.index())
   { }
  
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // FIXME having to check whether ArgType is sparse here is not very nice: coefficients are returned by value (Scalar) for Sparse storage, as XprType::CoeffReturnType otherwise.
+  typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
+                                         typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
   {
     return m_argImpl.coeff(row + rowOffset(), row + colOffset());
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index + rowOffset(), index + colOffset());
   }
 
-  Scalar& coeffRef(Index row, Index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index)
   {
     return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
   }
@@ -1110,8 +1250,88 @@ protected:
   const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
 private:
-  EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
-  EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+};
+
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
+// -------------------- EvalToTemp --------------------
+
+// expression class for evaluating nested expression to a temporary
+
+template<typename ArgType> class EvalToTemp;
+
+template<typename ArgType>
+struct traits<EvalToTemp<ArgType> >
+  : public traits<ArgType>
+{ }; // EvalToTemp<ArgType> has exactly the traits of ArgType
+
+template<typename ArgType>
+class EvalToTemp
+  : public dense_xpr_base<EvalToTemp<ArgType> >::type
+{
+ public:
+  // Thin wrapper around an expression; it only stores a reference to it and forwards rows()/cols().
+  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
+  // arg is held by reference, so it must outlive this EvalToTemp object
+  explicit EvalToTemp(const ArgType& arg)
+    : m_arg(arg)
+  { }
+  // the wrapped expression
+  const ArgType& arg() const
+  {
+    return m_arg;
+  }
+
+  Index rows() const 
+  {
+    return m_arg.rows();
+  }
+
+  Index cols() const 
+  {
+    return m_arg.cols();
+  }
+
+ private:
+  const ArgType& m_arg;
+};
+ 
+template<typename ArgType>
+struct evaluator<EvalToTemp<ArgType> >
+  : public evaluator<typename ArgType::PlainObject>::type
+{
+  typedef EvalToTemp<ArgType>                   XprType;
+  typedef typename ArgType::PlainObject         PlainObject;
+  typedef typename evaluator<PlainObject>::type Base;
+  // This evaluator owns the temporary m_result and exposes it through the PlainObject evaluator (Base).
+  typedef evaluator type;
+  typedef evaluator nestedType;
+  // Sizes m_result from xpr, re-constructs the Base sub-object in place on m_result, then assigns xpr.arg() into m_result.
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    // TODO we should simply do m_result(xpr.arg());
+    call_dense_assignment_loop(m_result, xpr.arg());
+  }
+
+  // This constructor is used when nesting an EvalToTemp evaluator in another evaluator; it takes the argument itself rather than the EvalToTemp wrapper
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
+    : m_result(arg.rows(), arg.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    // TODO we should simply do m_result(arg);
+    call_dense_assignment_loop(m_result, arg);
+  }
+
+protected:
+  PlainObject m_result;
 };
 
 } // namespace internal