merging updates from upstream

author: Deven Desai <deven.desai.amd@gmail.com> 2018-07-11 09:17:33 -0400
committer: Deven Desai <deven.desai.amd@gmail.com> 2018-07-11 09:17:33 -0400
commit: 38807a257500cd0746b819c994efab805b8a02e4 (patch)
tree: 0be837e16ad1dc2b09d8f2be2f752f074b169717 /Eigen/src/Core
parent: e2b2c61533cb923ddba41ba7bd64b87f30a25e29 (diff)
parent: f00d08cc0a987fa624209b920608b56638404f13 (diff)
26 files changed, 578 insertions, 247 deletions
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 688aadd62..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
-    const typename internal::remove_all<NestedExpressionType>::type& 
     EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type& 
     nestedExpression() const 
     {
       return m_expression;
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 6231f2edd..b65ec4ffb 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1080,7 +1080,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
     : m_argImpl(block.nestedExpression()), 
       m_startRow(block.startRow()), 
       m_startCol(block.startCol()),
-      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+      m_linear_offset((InnerPanel|| XprType::IsVectorAtCompileTime)?(XprType::IsRowMajor ? block.startRow()*block.cols() + block.startCol() : block.startCol()*block.rows() + block.startRow()):0)
   { }
  
   typedef typename XprType::Scalar Scalar;
@@ -1088,7 +1088,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = (InnerPanel || XprType::IsVectorAtCompileTime) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
  
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1162,7 +1162,7 @@ protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, (InnerPanel || XprType::IsVectorAtCompileTime) ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator; 
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 9e58fbf88..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
       EIGEN_UNUSED_VARIABLE(rows);
       EIGEN_UNUSED_VARIABLE(cols);
     }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
       return *this; 
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     void conservativeResize(Index size, Index rows, Index)
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 261f77b99..43f3b84c8 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };
 
 template<typename Scalar,int Size,int MaxSize>
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 6415a7696..72aa68d45 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -869,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 
 namespace numext {
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -886,19 +886,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
   return max EIGEN_NOT_A_MACRO (x,y);
 }
 
-
 #elif defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-
   return y < x ? y : x;
 }
 
 template<typename T>
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-
   return x < y ? y : x;
 }
 
@@ -942,7 +939,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
   return cl::sycl::max(x,y);
 }
 
-
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return cl::sycl::fmin(x,y);
@@ -976,6 +972,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+  return fminl(x, y);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -988,6 +997,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+  return fmaxl(x, y);
+}
 #endif
 
 
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 28737c15e..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x)
 }
 
 template<typename RealScalar>
-EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
   EIGEN_USING_STD_MATH(sqrt);
@@ -82,7 +82,8 @@ template<typename Scalar>
 struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 11435903b..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase
 
     inline const PartialPivLU<PlainObject> lu() const;
 
+    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;
 
 /////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase
 
 ////////// Householder module ///////////
 
+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5567d4c90..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log10;
     using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
 
@@ -49,12 +52,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits; }
 };
 
 template<typename T>
 struct default_digits_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log;
     using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
 
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
     #endif
 
     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
 
     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
 
     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 3d67d9489..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -127,7 +127,7 @@ public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;
   
-  EIGEN_STRONG_INLINE operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 2bb42f74b..0330b5741 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
@@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // but easier on the compiler side
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
   }
-  
+
   template<typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
+
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
+  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+  }
+
+  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+  
   
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@@ -741,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
@@ -785,7 +812,11 @@ public:
     _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };
   
   diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,7 +828,10 @@ public:
   
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }
   
 protected:
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0a99acb21..ddce65468 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
 
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime
   };
 
   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
     MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };
 
@@ -51,8 +51,8 @@ public:
 
 public:
   enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
@@ -64,9 +64,9 @@ public:
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
     EIGEN_DEBUG_VAR(PacketSize)
@@ -87,88 +87,88 @@ public:
 
 /*** no vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
   {
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
+    index = Start * redux_traits<Func, Evaluator>::PacketSize,
+    outer = index / int(Evaluator::InnerSizeAtCompileTime),
+    inner = index % int(Evaluator::InnerSizeAtCompileTime),
+    alignment = Evaluator::Alignment
   };
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };
 
@@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/
 
-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < xpr.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    const Index size = mat.size();
+    const Index size = xpr.size();
     
-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
 
       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
 
     return res;
@@ -265,130 +277,105 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 };
 
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
 
-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
     enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
     }
   }
 };
 
 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
   
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   
   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };
   
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-  
   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
-  const XprType & nestedExpression() const { return m_xpr; }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
-protected:
-  internal::evaluator<XprType> m_evaluator;
-  const XprType &m_xpr;
 };
 
 } // end namespace internal
@@ -415,7 +402,9 @@ DenseBase<Derived>::redux(const Func& func) const
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
   
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 
 /** \returns the minimum of all coefficients of \c *this.
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index ba7d6e629..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
     nestedExpression() { return m_matrix; }
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index ab73fcf21..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     inline Index innerStride() const { return derived().innerStride(); }
     
     // dummy resize function
+    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
 
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4u
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -784,7 +784,7 @@ typedef __vector __bool long         Packet2bl;
 
 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_MZERO = { -0.0, -0.0 };
 
@@ -1001,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
   Packet2d v[2], sum;
   v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
   v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
 #ifdef _BIG_ENDIAN
   sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
 
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
   return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index e1ecac1ab..b0a72e1f9 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -497,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -550,6 +550,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet1
 }
 
 template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half*  from) {
+  Packet16h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
 ploadquad(const Eigen::half* from) {
   Packet16h result;
   unsigned short a = from[0].x;
@@ -621,6 +636,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  // FIXME we could do that with bit manipulation
+  Packet16f af = half2float(a);
+  Packet16f rf = pnegate(af);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
@@ -628,6 +650,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
@@ -640,6 +669,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
   return half(predux(from_float));
 }
 
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
+  Packet16f pf[16];
+  pf[0] = half2float(p[0]);
+  pf[1] = half2float(p[1]);
+  pf[2] = half2float(p[2]);
+  pf[3] = half2float(p[3]);
+  pf[4] = half2float(p[4]);
+  pf[5] = half2float(p[5]);
+  pf[6] = half2float(p[6]);
+  pf[7] = half2float(p[7]);
+  pf[8] = half2float(p[8]);
+  pf[9] = half2float(p[9]);
+  pf[10] = half2float(p[10]);
+  pf[11] = half2float(p[11]);
+  pf[12] = half2float(p[12]);
+  pf[13] = half2float(p[13]);
+  pf[14] = half2float(p[14]);
+  pf[15] = half2float(p[15]);
+  Packet16f reduced = preduxp<Packet16f>(pf);
+  return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  Packet16h res;
+  res.x = _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
+                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
+{
+  Packet16h res;
+  res.x = _mm256_insert_epi16(a.x,b.x,0);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
+{
+  Packet16h res;
+  res.x = _mm256_insert_epi16(a.x,b.x,15);
+  return res;
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
 {
   Packet16h result;
@@ -747,20 +827,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
 
   // NOTE: no unpacklo/hi instr in this case, so using permute instr.
   __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
-  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
-  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
-  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
-  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
-  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
-  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
-  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
-  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
-  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
-  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
-  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
-  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
-  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
-  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
   __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
 
   kernel.packet[0].x = a_p_0;
@@ -865,10 +945,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -918,6 +998,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
 }
 
 template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half*  from) {
+  Packet8h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
 ploadquad<Packet8h>(const Eigen::half* from) {
   Packet8h result;
   unsigned short a = from[0].x;
@@ -970,6 +1061,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  // FIXME we could do that with bit manipulation
+  Packet8f af = half2float(a);
+  Packet8f rf = pnegate(af);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
@@ -977,6 +1075,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
@@ -1029,6 +1134,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
   return Eigen::half(reduced);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
+  Packet8f pf[8];
+  pf[0] = half2float(p[0]);
+  pf[1] = half2float(p[1]);
+  pf[2] = half2float(p[2]);
+  pf[3] = half2float(p[3]);
+  pf[4] = half2float(p[4]);
+  pf[5] = half2float(p[5]);
+  pf[6] = half2float(p[6]);
+  pf[7] = half2float(p[7]);
+  Packet8f reduced = preduxp<Packet8f>(pf);
+  return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  Packet8h res;
+  res.x = _mm_shuffle_epi8(a.x,m);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
+{
+  Packet8h res;
+  res.x = _mm_insert_epi16(a.x,int(b.x),0);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
+{
+  Packet8h res;
+  res.x = _mm_insert_epi16(a.x,int(b.x),7);
+  return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8h>
+{
+  static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
+  {
+    if (Offset!=0)
+      first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
+  }
+};
+
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet8h,8>& kernel) {
   __m128i a = kernel.packet[0].x;
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index dd4f366ee..1890efd4d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1526,10 +1526,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // The following piece of code won't work for 512 bit registers
           // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
           // as nr (which is currently 4) for the return type.
-          typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+          const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
           if ((SwappedTraits::LhsProgress % 4) == 0 &&
               (SwappedTraits::LhsProgress <= 8) &&
-              (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
+              (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
           {
             SAccPacket C0, C1, C2, C3;
             straits.initAcc(C0);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..dc47cf7cf 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     // to determine the following heuristic.
     // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
     // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
     // I'm not sure it is still required.
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::evalTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
       dst.setZero();
@@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::addTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
   }
@@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::subTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
   }
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 67390f1d7..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
 };
 
 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-
-  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
   if (FirstTriangular)
     bound = size - bound;
 
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   enum { LhsUpLo = LhsMode&(Upper|Lower) };
 
   template<typename Dest>
-  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index d395888e5..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b2..647317016 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if(!(Mode & UnitDiag))
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
-
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
+
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
       }
       Index r = IsLower ? size - endBlock : startBlock; // remaining size
       if (r > 0)
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
             LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index b1791fb3a..705925984 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -289,8 +289,8 @@ template<typename XprType> struct blas_traits
     ExtractType,
     typename _ExtractType::PlainObject
     >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
 };
 
 // pop conjugate
@@ -318,8 +318,8 @@ struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
   { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
 template<typename Scalar, typename NestedXpr, typename Plain>
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 78a4705cd..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
 // The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
 // This way a code like fix<N> can only refer to the previous function.
 template<int N,typename T>
-inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); }
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
 #endif
 
 #else // EIGEN_PARSED_BY_DOXYGEN
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 059d06874..f2cac01ac 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -580,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH
   #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
   #elif EIGEN_COMP_MSVC
@@ -599,12 +599,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
      * In this case, the buffer elements will also be destructed when this handler will be destructed.
      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
      **/
+    EIGEN_DEVICE_FUNC
     aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
         Eigen::internal::construct_elements_of_array(m_ptr, size);
     }
+    EIGEN_DEVICE_FUNC
     ~aligned_stack_memory_handler()
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -618,6 +620,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     bool m_deallocate;
 };
 
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+         >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
 template<typename T> class scoped_array : noncopyable
 {
   T* m_ptr;
@@ -645,9 +701,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 } // end namespace internal
 
 /** \internal
-  * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+  * 
+  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+  * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
   * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
   * Here is an example:
@@ -658,6 +716,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
   * }
   * \endcode
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+  * 
+  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+  * \code
+  *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+  * \endcode
+  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+  * This is accomplished through alloca if this later is supported and if the required number of bytes
+  * is below EIGEN_STACK_ALLOCATION_LIMIT.
   */
 #ifdef EIGEN_ALLOCA
 
@@ -677,6 +743,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
                     : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
 
+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
 #else
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -684,6 +757,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
 
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR);
+
 #endif
 
 
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 5a358bc12..f37aac56b 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -583,6 +583,7 @@ using std::numeric_limits;
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template<typename T>
+EIGEN_DEVICE_FUNC
 T div_ceil(const T &a, const T &b)
 {
   return (a+b-1) / b;
@@ -593,7 +594,7 @@ T div_ceil(const T &a, const T &b)
 template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool equal_strict(const X& x,const Y& y) { return x == y; }
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) 
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
 
@@ -604,7 +605,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double
 template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const X& x,const Y& y) { return x != y; }
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) 
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
 
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 16714fa5c..e3231c712 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
 {
   enum {
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a tempory?
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
                                                   //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
                                                   //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
                                                   //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
@@ -676,17 +676,32 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
 template<typename S1, typename S2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
 
 template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
 }
 
 template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return false;
 }
author	Deven Desai <deven.desai.amd@gmail.com>	2018-07-11 09:17:33 -0400
committer	Deven Desai <deven.desai.amd@gmail.com>	2018-07-11 09:17:33 -0400
commit	38807a257500cd0746b819c994efab805b8a02e4 (patch)
tree	0be837e16ad1dc2b09d8f2be2f752f074b169717 /Eigen/src/Core
parent	e2b2c61533cb923ddba41ba7bd64b87f30a25e29 (diff)
parent	f00d08cc0a987fa624209b920608b56638404f13 (diff)