From 5418154a45db637211e94f11ee04c6ae4dc8cf85 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 20 Jun 2018 17:51:48 -0700
Subject: Fix oversharding bug in parallelFor.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ca9ba402e..90fd99027 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -189,9 +189,11 @@ struct ThreadPoolDevice {
     // of blocks to be evenly dividable across threads.
     double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
-    const Index max_block_size =
-        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+                               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
     if (block_align) {
       Index new_block_size = block_align(block_size);
       eigen_assert(new_block_size >= block_size);
@@ -205,7 +207,8 @@ struct ThreadPoolDevice {
         (divup<int>(block_count, numThreads()) * numThreads());
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
       // This is the next block size that divides size into a smaller number
       // of blocks than the current block_size.
       Index coarser_block_size = divup(n, prev_block_count - 1);
--
cgit v1.2.3


From 4cc32d80fd091559bc90a55946945fc1584fdd86 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jun 2018 10:28:38 +0200
Subject: bug #1555: compilation fix with XLC

---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f6dbe62a 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--
cgit v1.2.3


From bda71ad3945d4cf2bf06cf567b0b5a8a4663fede Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 22 Jun 2018 15:04:35 -0700
Subject: Fix typo in pblend for AltiVec.
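
The fix below restores the closing parenthesis that the previous change
(bug #1555) dropped from the reinterpret_cast in pblend, which broke
compilation of the VSX path. As a quick illustration of what pblend computes
(hypothetical values, not part of the patch; assumes the usual VSX packet
typedefs), each lane of the result comes from thenPacket where the selector
is 1 and from elsePacket otherwise:

    // illustration only: lane-wise select between two packets
    Selector<2> s; s.select[0] = 1; s.select[1] = 0;
    Packet2d a = { 1.0, 2.0 };
    Packet2d b = { 3.0, 4.0 };
    Packet2d r = internal::pblend(s, a, b);  // expected lanes: { 1.0, 4.0 }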
---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -784,7 +784,7 @@ typedef __vector __bool long Packet2bl;

 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_MZERO = { -0.0, -0.0 };
@@ -1001,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
   Packet2d v[2], sum;
   v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
   v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
 #ifdef _BIG_ENDIAN
   sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
   return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--
cgit v1.2.3


From ee5864f72e83830f536ad91dc38d574c02a08348 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 25 Jun 2018 10:30:12 +0200
Subject: bug #1560 fix product with a 1x1 diagonal matrix

---
 Eigen/src/Core/ProductEvaluators.h | 11 +++++++++--
 test/diagonalmatrices.cpp          | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 0d5aec570..60de771b2 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -785,7 +785,11 @@ public:
     _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =  (DiagonalType::SizeAtCompileTime==1)
+                    || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                    || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };

   diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,7 +801,10 @@ public:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }

 protected:
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index a2092c43e..73b4cd58c 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -30,6 +30,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
             v2 = VectorType::Random(rows);
   RowVectorType rv1 = RowVectorType::Random(cols),
             rv2 = RowVectorType::Random(cols);
+
   LeftDiagonalMatrix ldm1(v1), ldm2(v2);
   RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
@@ -105,6 +106,25 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   sq_m2 = sq_m1 * sq_m2;
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+
+  if(v1.size()==1)
+  {
+    typedef Matrix<Scalar,Dynamic,1> DynVectorType;
+    typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;
+    Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    DynVectorType dv1 = DynVectorType::Random(depth);
+    DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+    DynMatrixType dm1 = dv1;
+    DynMatrixType drm1 = drv1;
+
+    Scalar s = v1(0);
+
+    VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+    VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+    VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+    VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
+  }
 }
@@ -130,6 +150,7 @@ void test_diagonalmatrices()
     CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
   }
   CALL_SUBTEST_10( bug987<0>() );
 }
--
cgit v1.2.3


From f9d337780d49825765cbb8ea51843a905c0e5253 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 25 Jun 2018 14:26:51 +0200
Subject: First step towards a generic vectorised quaternion product

---
 Eigen/src/Geometry/arch/Geometry_SSE.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index f68cab583..d4346aa1c 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -26,17 +26,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
     Quaternion<float> res;
-    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
-    __m128 a = _a.coeffs().template packet<AAlignment>(0);
-    __m128 b = _b.coeffs().template packet<BAlignment>(0);
-    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
-    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+    const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    Packet4f a = _a.coeffs().template packet<AAlignment>(0);
+    Packet4f b = _b.coeffs().template packet<BAlignment>(0);
+    Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstoret<float,Packet4f,ResAlignment>(
               &res.x(),
-              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
-                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
+              padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+                        pmul(vec4f_swizzle1(a,2,0,1,0),
                                    vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+                   pxor(mask,padd(s1,s2))));
     return res;
   }
--
cgit v1.2.3


From b7689bded9e87c95affb5963ad565a68974aa7f7 Mon Sep 17 00:00:00 2001
From: Jonathan Liu <net147@gmail.com>
Date: Thu, 28 Jun 2018 00:32:37 +1000
Subject: Use std::complex constructor instead of assignment from scalar

Fixes GCC "conversion to non-scalar type requested" compile error when using
boost::multiprecision::cpp_dec_float_50 as scalar type.
---
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6f..079e88602 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -316,8 +316,8 @@ struct kissfft_impl
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<Complex*> (src), ncfft);
-        Complex dc = dst[0].real() + dst[0].imag();
-        Complex nyquist = dst[0].real() - dst[0].imag();
+        Complex dc(dst[0].real() + dst[0].imag());
+        Complex nyquist(dst[0].real() - dst[0].imag());
         int k;
         for ( k=1;k <= ncfft2 ; ++k )
         {
           Complex fpk = dst[k];
--
cgit v1.2.3


From 9a81de1d3543d89d2cc5d32294d96b3c58730b21 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 28 Jun 2018 00:20:59 +0200
Subject: Fix order of EIGEN_DEVICE_FUNC and returned type

---
 Eigen/src/Core/ArrayWrapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 688aadd62..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }

-    const typename internal::remove_all<NestedExpressionType>::type&
     EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type&
     nestedExpression() const
     {
       return m_expression;
--
cgit v1.2.3


From 0cdacf3fa49ff52fac3058beae1d6beaa0ebec46 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 29 Jun 2018 11:28:36 +0200
Subject: update comment

---
 Eigen/src/QR/HouseholderQR.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 13c02685d..33cb9c8ff 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -291,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs>
 struct householder_qr_inplace_blocked
 {
-  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
   static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
--
cgit v1.2.3


From a7b313a16cf5b64981dd953f150327638379e68b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sun, 1 Jul 2018 22:45:47 +0200
Subject: Fix unit test

---
 test/diagonalmatrices.cpp | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index 73b4cd58c..a4ff10239 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -106,25 +106,32 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   sq_m2 = sq_m1 * sq_m2;
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+}

-  if(v1.size()==1)
-  {
-    typedef Matrix<Scalar,Dynamic,1> DynVectorType;
-    typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;
-    Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
-    DynVectorType dv1 = DynVectorType::Random(depth);
-    DynRowVectorType drv1 = DynRowVectorType::Random(depth);
-    DynMatrixType dm1 = dv1;
-    DynMatrixType drm1 = drv1;
-
-    Scalar s = v1(0);
+template<typename MatrixType> void as_scalar_product(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DynMatrixType;
+  typedef Matrix<Scalar,Dynamic,1> DynVectorType;
+  typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;

-    VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
-    VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+  Index rows  = m.rows();
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);

-    VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
-    VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
-  }
+  VectorType v1 = VectorType::Random(rows);
+  DynVectorType dv1 = DynVectorType::Random(depth);
+  DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+  DynMatrixType dm1 = dv1;
+  DynMatrixType drm1 = drv1;
+
+  Scalar s = v1(0);
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+  VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+  VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
 }

 template<int>
@@ -142,15 +149,19 @@ void test_diagonalmatrices()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) );
+
     CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
     CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) );
     CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
     CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) );
     CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
     CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
+    CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
   }
   CALL_SUBTEST_10( bug987<0>() );
 }
--
cgit v1.2.3


From d428a199ab70bc08db7551457a1e9d8f65d9ebb9 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 11:41:09 +0200
Subject: bug #1562: optimize evaluation of small products of the form s*A*B
 by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary.
 Measured speedup from 2x to 5x...
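
For example (hypothetical sizes, public API only; this is the pattern caught
by the new eval_dynamic overload below):

    MatrixXf A(8,8), B(8,8), C(8,8);
    float s = 0.5f;
    C.noalias() = s * A * B;  // before: (s*A) is evaluated into a temporary,
                              // after:  evaluated as s * (A.lazyProduct(B))
                              //         for small dynamic-size products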
---
 Eigen/src/Core/ProductEvaluators.h            | 28 ++++++++++++++++++++++++++-
 Eigen/src/Core/products/GeneralMatrixMatrix.h |  8 ++++----
 test/product.h                                | 11 +++++++++++
 test/product_large.cpp                        |  2 ++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 60de771b2..8072a1959 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // but easier on the compiler side
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
   }

   template<typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
+
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... so let's be conservative and enable it for dynamic sizes only,
+  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                                  const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+  }
+
+  // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..dc47cf7cf 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     // to determine the following heuristic.
     // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
     // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
     // I'm not sure it is still required.
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::evalTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
       dst.setZero();
@@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::addTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
   }
@@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::subTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
   }
diff --git a/test/product.h b/test/product.h
index 0425a929e..d26e8063d 100644
--- a/test/product.h
+++ b/test/product.h
@@ -111,6 +111,17 @@ template<typename MatrixType> void product(const MatrixType& m)
   vcres.noalias() -= m1.transpose() * v1;
   VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);

+  // test scaled products
+  res = square;
+  res.noalias() = s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() += s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() -= s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose()));
+
   // test d ?= a+b*c rules
   res.noalias() = square + m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 845cd40ca..14a4f739d 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -35,6 +35,8 @@ void test_product_large()
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
+
     CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
--
cgit v1.2.3


From d6255649361fde3e937320dfd216a84203adc295 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 11:50:41 +0200
Subject: Simplify redux_evaluator using inheritance, and properly rename
 parameters in reducers.

---
 Eigen/src/Core/Redux.h | 245 +++++++++++++++++++++++--------------------------
 1 file changed, 115 insertions(+), 130 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0a99acb21..32574ba60 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/

-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime
   };

   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
     MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };

 public:
   enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };

 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
     EIGEN_DEBUG_VAR(PacketSize)

 /*** no vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };

 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };

 /*** vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     HalfLength = Length/2
   };

-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
   {
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
   }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
+    index = Start * redux_traits<Func, Evaluator>::PacketSize,
+    outer = index / int(Evaluator::InnerSizeAtCompileTime),
+    inner = index % int(Evaluator::InnerSizeAtCompileTime),
+    alignment = Evaluator::Alignment
   };

-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };

 /***************************************************************************
 * Part 3 : implementation of all cases
 ***************************************************************************/

-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < eval.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < eval.outerSize(); ++i)
+      for(Index j = 0; j < eval.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
 {};

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static Scalar run(const Derived &mat, const Func& func)
+  static Scalar run(const Evaluator &eval, const Func& func)
   {
-    const Index size = mat.size();
+    const Index size = eval.size();

-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(eval.nestedExpression());
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
     const Index alignedEnd  = alignedStart + alignedSize;
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }

         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);

       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));

       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }

     return res;
   }
 };

 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = eval.innerSize();
+    const Index outerSize = eval.outerSize();
     enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));

       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func);
     }

     return res;
   }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func,Derived,VectorizedSize,Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func,Evaluator,VectorizedSize,Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func,Derived,0,Size>::run(mat,func);
+      return redux_novec_unroller<Func,Evaluator,0,Size>::run(eval,func);
     }
   }
 };

 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr), m_xpr(xpr) {}

   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;

   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };

   EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
   EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
   EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
   EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
   EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }

   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   const XprType & nestedExpression() const { return m_xpr; }

 protected:
-  internal::evaluator<XprType> m_evaluator;
   const XprType &m_xpr;
 };
--
cgit v1.2.3


From 047677a08d907031bfa596017c15493c982181c5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 12:18:25 +0200
Subject: Fix regression in changeset f05dea6b2326836e5e0243fbaffbece84b833d64
 : computeFromHessenberg can take any expression for matrixQ, not only an
 HouseholderSequence.

---
 Eigen/src/Eigenvalues/RealSchur.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 4c53344bb..aca8a8279 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
   // Step 1. Reduce to Hessenberg form
   m_hess.compute(matrix.derived()/scale);

-  // Step 2. Reduce to real Schur form
-  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+  // Step 2. Reduce to real Schur form
+  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
+  m_workspaceVector.resize(matrix.cols());
+  if(computeU)
+    m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);

   m_matT *= scale;

@@ -285,8 +290,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
   m_matT = matrixH;
   m_workspaceVector.resize(m_matT.cols());
-  if(computeU)
-    matrixQ.evalTo(m_matU, m_workspaceVector);
+  if(computeU && !internal::is_same_dense(m_matU,matrixQ))
+    m_matU = matrixQ;

   Index maxIters = m_maxIters;
   if (maxIters == -1)
--
cgit v1.2.3


From 67ec37f7b0d9ccaaede508109e50528138eaf4d8 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 12:54:14 +0200
Subject: Activate dgmres unit test

---
 unsupported/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index e99eab0e3..fa4e76e42 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -124,6 +124,7 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
--
cgit v1.2.3


From 3ae2083e230ec9517cb9b40d1137dec7ba4b2b17 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 3 Jul 2018 13:21:43 +0200
Subject: Make is_same_dense compatible with different scalar types.

---
 Eigen/src/Core/util/XprHelper.h |  9 +++++++--
 test/is_same_dense.cpp          |  8 ++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 16714fa5c..f7a3d9ce7 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -680,13 +680,18 @@ template<typename T1, typename T2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };

 template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
 }

 template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return false;
 }
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
index 2c7838ce9..c4e2fbc92 100644
--- a/test/is_same_dense.cpp
+++ b/test/is_same_dense.cpp
@@ -14,9 +14,13 @@ using internal::is_same_dense;
 void test_is_same_dense()
 {
   typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+  typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
   ColMatrixXd m1(10,10);
+  ColMatrixXcd m2(10,10);
   Ref<ColMatrixXd> ref_m1(m1);
+  Ref<ColMatrixXd,0,Stride<Dynamic,Dynamic> > ref_m2_real(m2.real());
   Ref<const ColMatrixXd> const_ref_m1(m1);
+
   VERIFY(is_same_dense(m1,m1));
   VERIFY(is_same_dense(m1,ref_m1));
   VERIFY(is_same_dense(const_ref_m1,m1));
@@ -30,4 +34,8 @@ void test_is_same_dense()
   Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
   VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+
+
+  VERIFY(!is_same_dense(m1, ref_m2_real));
+  VERIFY(!is_same_dense(m2, ref_m2_real));
 }
--
cgit v1.2.3


From 6a241bd8ee45df0ba112d4d6874888499b51cd34 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 3 Jul 2018 14:02:46 +0200
Subject: Implement custom inplace triangular product to avoid a temporary

---
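Note (sketch, not part of the commit): the product replaced below has the
form x = x * U with U upper triangular, which can be evaluated in place
because the result coefficient x(j) only depends on x(0..j). A minimal
standalone sketch of that dependency, with hypothetical names x, U, n:

    // new x[j] = sum_{k<=j} x[k] * U(k,j); walking j from right to left
    // ensures x[j] is overwritten only after no later column needs it.
    for (Index j = n-1; j >= 0; --j) {
      Scalar z = x[j] * U(j,j);
      for (Index k = 0; k < j; ++k)
        z += x[k] * U(k,j);
      x[j] = z;
    }

The loop introduced in the diff is the same idea, restructured to scatter each
old coefficient into the remaining tail of the row instead of gathering.
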
 Eigen/src/Householder/BlockHouseholder.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 01a7ed188..39ce1c2a0 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
                                                         * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();

-      // FIXME add .noalias() once the triangular product can work inplace
-      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      // FIXME use the following line with .noalias() once the triangular product can work inplace
+      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      for(Index j=nbVecs-1; j>i; --j)
+      {
+        typename TriangularFactorType::Scalar z = triFactor(i,j);
+        triFactor(i,j) = z * triFactor(j,j);
+        if(nbVecs-j-1>0)
+          triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+      }
     }
     triFactor(i,i) = hCoeffs(i);
--
cgit v1.2.3


From 05371239533012e652de0b88a3e0aa992a48a80f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 5 Jul 2018 09:21:26 +0200
Subject: bug #1565: help MSVC to generate not too bad ASM in reductions.

---
 Eigen/src/Core/Redux.h | 62 +++++++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 32574ba60..ddce65468 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -187,16 +187,17 @@ struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;

+  template<typename XprType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func)
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
     res = eval.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < eval.innerSize(); ++i)
+    for(Index i = 1; i < xpr.innerSize(); ++i)
       res = func(res, eval.coeffByOuterInner(0, i));
-    for(Index i = 1; i < eval.outerSize(); ++i)
-      for(Index j = 0; j < eval.innerSize(); ++j)
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
         res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }

 template<typename Func, typename Evaluator>
 struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
   : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
-{};
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};

 template<typename Func, typename Evaluator>
 struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;
   typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static Scalar run(const Evaluator &eval, const Func& func)
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    const Index size = eval.size();
+    const Index size = xpr.size();

     const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
       alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
       alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(eval.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -272,11 +283,12 @@ struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
   typedef typename Evaluator::Scalar Scalar;
   typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

-  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = eval.innerSize();
-    const Index outerSize = eval.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
     enum {
       packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }

     return res;
   }
 };

 template<typename Func, typename Evaluator>
 struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;

   typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
     PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };

+  template<typename XprType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func)
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
       Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
       if (VectorizedSize != Size)
         res = func(res,redux_novec_unroller<Func,Evaluator,VectorizedSize,Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
       return redux_novec_unroller<Func,Evaluator,0,Size>::run(eval,func);
     }
   }
 };

 // evaluator adaptor
 template<typename _XprType>
 class redux_evaluator : public internal::evaluator<_XprType>
 {
   typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}

   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };

-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
   { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
   { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-
-  const XprType & nestedExpression() const { return m_xpr; }
-
-protected:
-  const XprType &m_xpr;
 };

 } // end namespace internal

@@ -400,7 +402,9 @@ DenseBase<Derived>::redux(const Func& func) const

   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());

-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }

 /** \returns the minimum of all coefficients of \c *this.
--
cgit v1.2.3


From f7124b3e467363e45c3d906b7003f1520a5f804a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 11 Jun 2018 18:33:24 +0200
Subject: Extend CUDA support to matrix inversion and selfadjointeigensolver

---
 Eigen/Core                                        | 39 ++++++++++-----
 Eigen/PardisoSupport                              |  0
 Eigen/src/Core/DenseStorage.h                     | 58 +++++++++++++++--------
 Eigen/src/Core/GeneralProduct.h                   |  4 +-
 Eigen/src/Core/MathFunctions.h                    | 31 ++++++++++--
 Eigen/src/Core/MathFunctionsImpl.h                |  5 +-
 Eigen/src/Core/MatrixBase.h                       |  8 ++++
 Eigen/src/Core/NumTraits.h                        |  6 +++
 Eigen/src/Core/PermutationMatrix.h                |  6 +--
 Eigen/src/Core/Product.h                          |  2 +-
 Eigen/src/Core/ProductEvaluators.h                |  3 +-
 Eigen/src/Core/Transpose.h                        |  1 +
 Eigen/src/Core/TriangularMatrix.h                 |  2 +
 Eigen/src/Core/products/SelfadjointMatrixVector.h | 12 +++--
 Eigen/src/Core/products/SelfadjointRank2Update.h  |  3 +-
 Eigen/src/Core/util/BlasUtil.h                    |  8 ++--
 Eigen/src/Core/util/Memory.h                      |  4 +-
 Eigen/src/Core/util/Meta.h                        |  5 +-
 Eigen/src/Core/util/XprHelper.h                   |  2 +
 Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h    | 16 ++++---
 Eigen/src/Eigenvalues/Tridiagonalization.h        |  9 +++-
 Eigen/src/Householder/Householder.h               |  4 ++
 Eigen/src/Householder/HouseholderSequence.h       | 17 ++++++-
 Eigen/src/Jacobi/Jacobi.h                         | 11 ++++-
 Eigen/src/LU/Determinant.h                        | 15 ++++--
 Eigen/src/LU/InverseImpl.h                        |  2 +
 Eigen/src/plugins/BlockMethods.h                  |  1 +
 27 files changed, 200 insertions(+), 74 deletions(-)
 mode change 100755 => 100644 Eigen/PardisoSupport

diff --git a/Eigen/Core b/Eigen/Core
index f6bc18a08..5117461c7 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -45,27 +45,40 @@
   #ifdef EIGEN_EXCEPTIONS
     #undef EIGEN_EXCEPTIONS
   #endif
+#endif

-  // All functions callable from CUDA code must be qualified with __device__
-  #ifdef EIGEN_CUDACC
-    // Do not try to vectorize on CUDA and SYCL!
-    #ifndef EIGEN_DONT_VECTORIZE
-      #define EIGEN_DONT_VECTORIZE
-    #endif
+// All functions callable from CUDA code must be qualified with __device__
+#ifdef EIGEN_CUDACC
+  // Do not try to vectorize on CUDA and SYCL!
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif

-    #define EIGEN_DEVICE_FUNC __host__ __device__
-    // We need cuda_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
-    // works properly on the device side
-    #include <cuda_runtime.h>
-  #else
-    #define EIGEN_DEVICE_FUNC
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  // We need cuda_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
+  // works properly on the device side
+  #include <cuda_runtime.h>
+
+  #if EIGEN_HAS_CONSTEXPR
+    // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+    #if defined(__NVCC__)
+      // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+      #ifdef __CUDACC_RELAXED_CONSTEXPR__
+        #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+      #endif
+    #elif defined(__clang__) && defined(__CUDA__)
+      // clang++ always considers constexpr functions as implicitly __host__ __device__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
   #endif
 #else
   #define EIGEN_DEVICE_FUNC
 #endif

 #ifdef __NVCC__
-#define EIGEN_DONT_VECTORIZE
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
 #endif

 // When compiling CUDA device code with NVCC, pull in math functions from the
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 9e58fbf88..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
     EIGEN_UNUSED_VARIABLE(rows);
     EIGEN_UNUSED_VARIABLE(cols);
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data, other.m_data);
+  }
   EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
   EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
   EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
   EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-  { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+  {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_rows,other.m_rows);
+    numext::swap(m_cols,other.m_cols);
+  }
   EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
   EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
   void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_cols,other.m_cols);
+  }
   EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
   EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
   EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_rows,other.m_rows);
+  }
   EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
   EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
   void conservativeResize(Index size, Index rows, Index)
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 694f7cbde..bd2361e9a 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };

 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };

 template<typename Scalar,int Size,int MaxSize>
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 954863c39..a5740334a 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -864,7 +864,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);

 namespace numext {

-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -881,19 +881,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
   return max EIGEN_NOT_A_MACRO (x,y);
 }

-
 #elif defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-
   return y < x ? y : x;
 }

 template<typename T>
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-
   return x < y ? y : x;
 }
@@ -937,7 +934,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
   return cl::sycl::max(x,y);
 }

-
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return cl::sycl::fmin(x,y);
@@ -971,6 +967,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+  return fminl(x, y);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -983,6 +992,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+  return fmaxl(x, y);
+}
 #endif
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 28737c15e..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x)
 }

 template<typename RealScalar>
-EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
   EIGEN_USING_STD_MATH(sqrt);
@@ -82,7 +82,8 @@ struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 11435903b..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase

     inline const PartialPivLU<PlainObject> lu() const;

+    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;

     template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;

     /////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase

     ////////// Householder module ///////////

+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5567d4c90..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits10; }
 };

 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log10;
     using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
@@ -49,12 +52,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits; }
 };

 template<typename T>
 struct default_digits_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log;
     using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
     #endif

     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }

     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }

     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }

     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 3d67d9489..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -127,7 +127,7 @@ public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;

-  EIGEN_STRONG_INLINE operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 8072a1959..76e5083c1 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -767,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;

   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index ba7d6e629..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
     nestedExpression() { return m_matrix; }

     /** \internal */
+    EIGEN_DEVICE_FUNC
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index ab73fcf21..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     inline Index innerStride() const { return derived().innerStride(); }

     // dummy resize function
+    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 67390f1d7..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product
   enum { LhsUpLo = LhsMode&(Upper|Lower) };

   template<typename Dest>
-  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index d395888e5..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ @@ template<typename XprType> struct blas_traits
                                 ExtractType,
                                 typename _ExtractType::PlainObject
                               >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -318,8 +318,8 @@ struct blas_traits, const CwiseNullaryOp typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 53300c388..22d7679c5 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -542,7 +542,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -561,12 +561,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0d0b8c43a..ef9860c4b 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -544,6 +544,7 @@ using std::numeric_limits; // Integer division with rounding up. 
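// (A quick worked check of the round-up formula (a+b-1)/b used by div_ceil below,
// assuming ordinary truncating integer division: a=7, b=3 gives (7+3-1)/3 = 9/3 = 3,
// i.e. ceil(7/3); the exact-multiple case a=6, b=3 gives (6+3-1)/3 = 8/3 = 2,
// so no spurious extra block is produced.)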
// T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -554,7 +555,7 @@ T div_ceil(const T &a, const T &b) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } @@ -565,7 +566,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f7a3d9ce7..7311adec7 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -685,12 +685,14 @@ struct possibly_same_dense { }; template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..fbc1ee2f6 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -354,7 +356,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -403,7 +406,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD_MATH(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -479,9 +482,10 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; + EIGEN_USING_STD_MATH(abs); ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -535,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -605,7 +609,7 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(abs); Index i0; // Find non-zero column i0 (by 
construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -807,7 +811,7 @@ template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + EIGEN_USING_STD_MATH(abs); RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17b..c5c1acf46 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index a5f336d18..5bc037f00 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -110,6 +112,7 @@ void MatrixBase::makeHouseholder( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -147,6 +150,7 @@ void MatrixBase::applyHouseholderOnTheLeft( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index fad1d5ab6..a4f40b75c 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ 
b/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -173,6 +173,7 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) @@ -180,6 +181,7 @@ template class HouseholderS } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), @@ -193,12 +195,14 @@ template class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. @@ -215,6 +219,7 @@ template class HouseholderS * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -252,7 +257,9 @@ template class HouseholderS AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -261,6 +268,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -394,6 +402,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -411,13 +420,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. 
*/ /* Necessary for .adjoint() and .conjugate() */ diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index af1228cb8..bba75fc4f 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -90,6 +90,7 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; @@ -134,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -156,6 +158,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) { makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -164,6 +167,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; @@ -223,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -301,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -314,7 +321,8 @@ template struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..6af63a6e7 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper } template +EIGEN_DEVICE_FUNC const typename Derived::Scalar bruteforce_det4_helper (const MatrixBase& matrix, int j, int k, int m, int n) { @@ -44,7 +46,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 
+55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +64,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,7 +75,8 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + static EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { // trick by Martin Costabel to compute 4x4 det with only 30 muls return bruteforce_det4_helper(m,0,1,2,3) @@ -89,6 +95,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index f49f23360..1bab00c01 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -290,6 +290,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +333,7 @@ struct Assignment, internal::assign_op +EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index a5748525b..67fdebc6f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1061,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template +EIGEN_DEVICE_FUNC inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { -- cgit v1.2.3 From a8ab6060df270709b9975102382caa0eb8ef22ec Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 09:58:45 +0200 Subject: Add unit tests for inverse and selfadjoint-eigenvalues on CUDA --- test/cuda_basic.cu | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index ce66c2c78..33e5fd119 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -121,7 +121,7 @@ struct diagonal { }; template -struct eigenvalues { +struct eigenvalues_direct { EIGEN_DEVICE_FUNC void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const { @@ -136,6 +136,34 @@ struct eigenvalues { } }; +template +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix Vec; + T M(in+i); + Map res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver eig; + eig.compute(M); + res = eig.eigenvalues(); + } +}; + +template +struct matrix_inverse { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T M(in+i); + Map res(out+i*T::MaxSizeAtCompileTime); + res =
M.inverse(); + } +}; + void test_cuda_basic() { ei_test_init_cuda(); @@ -163,8 +191,13 @@ void test_cuda_basic() CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); } -- cgit v1.2.3 From f4d623ffa772093169a0ac949c7955bf38bf1bbf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:13:36 +0200 Subject: Complete Packet8h implementation and test it in packetmath unit test --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 85 +++++++++++++++++++++++++++++-- test/packetmath.cpp | 15 +++--- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 8a6f209c4..2ee92b4f6 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -493,6 +493,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, con return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -730,10 +737,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 8, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -782,6 +789,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); } +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); + return result; +} + template<> EIGEN_STRONG_INLINE Packet8h ploadquad(const Eigen::half* from) { Packet8h result; @@ -835,6 +853,12 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8f af = half2float(a); + Packet8f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -842,6 +866,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = 
psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -894,6 +925,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& return Eigen::half(reduced); } +template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { + Packet8f pf[8]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + Packet8f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet8h res; + res.x = _mm_shuffle_epi8(a.x,m); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),15); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) + { + if (Offset!=0) + first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); + } +}; + EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128i a = kernel.packet[0].x; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 4b19be92d..94ee91e3b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -123,7 +123,7 @@ template void packetmath() EIGEN_ALIGN_MAX Scalar data2[size]; EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; - RealScalar refvalue = 0; + RealScalar refvalue = RealScalar(0); for (int i=0; i()/RealScalar(PacketSize); @@ -178,7 +178,8 @@ template void packetmath() VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate); - VERIFY((internal::is_same::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); + // Disabled as it is not clear why it would be mandatory to support division. + //VERIFY((internal::is_same::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd); CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub); @@ -242,29 +243,30 @@ template void packetmath() } } - ref[0] = 0; + ref[0] = Scalar(0); for (int i=0; i(data1)), refvalue) && "internal::predux"); + if(PacketSize==8 && internal::unpacket_traits::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case { int HalfPacketSize = PacketSize>4 ? 
PacketSize/2 : PacketSize; for (int i=0; i(data1))); VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4"); } - ref[0] = 1; + ref[0] = Scalar(1); for (int i=0; i(data1))) && "internal::predux_mul"); for (int j=0; j(data1+j*PacketSize); @@ -630,6 +632,7 @@ void test_packetmath() CALL_SUBTEST_3( packetmath() ); CALL_SUBTEST_4( packetmath >() ); CALL_SUBTEST_5( packetmath >() ); + CALL_SUBTEST_6( packetmath() ); CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); -- cgit v1.2.3 From 56a33ae57dd536f35be007eb051a636cea4467d0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:14:04 +0200 Subject: test product kernel with half-floats. --- test/half_float.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/half_float.cpp b/test/half_float.cpp index 7734f82cc..487d0d1b2 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -257,13 +257,31 @@ void test_array() ss << a1; } +void test_product() +{ + typedef Matrix MatrixXh; + Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index depth = internal::random(1,EIGEN_TEST_MAX_SIZE); + MatrixXh Ah = MatrixXh::Random(rows,depth); + MatrixXh Bh = MatrixXh::Random(depth,cols); + MatrixXh Ch = MatrixXh::Random(rows,cols); + MatrixXf Af = Ah.cast(); + MatrixXf Bf = Bh.cast(); + MatrixXf Cf = Ch.cast(); + VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast()); +} + void test_half_float() { - CALL_SUBTEST(test_conversion()); CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); - CALL_SUBTEST(test_array()); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); + CALL_SUBTEST(test_array()); + CALL_SUBTEST(test_product()); + } } -- cgit v1.2.3 From a937c5020889f95aadccefc79338ed83c2ff8784 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:41:52 +0200 Subject: palign is not used anymore, so let's relax the unit test --- test/packetmath.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 94ee91e3b..56e017383 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -171,7 +171,10 @@ template void packetmath() for (int i=0; i Date: Fri, 6 Jul 2018 17:43:11 +0200 Subject: complete implementation of Packet16h (AVX512) --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 2ee92b4f6..c068351ce 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -486,6 +486,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + // FIXME we could do that with bit manipulation + Packet16f af = half2float(a); + Packet16f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -512,6 +519,51 @@ template<> EIGEN_STRONG_INLINE 
half predux(const Packet16h& from) { return half(predux(from_float)); } +template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { + Packet16f pf[16]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + pf[8] = half2float(p[8]); + pf[9] = half2float(p[9]); + pf[10] = half2float(p[10]); + pf[11] = half2float(p[11]); + pf[12] = half2float(p[12]); + pf[13] = half2float(p[13]); + pf[14] = half2float(p[14]); + pf[15] = half2float(p[15]); + Packet16f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet16h res; + res.x = _mm256_set_m128i(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,15); + return res; +} + template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) { Packet16h result; @@ -854,6 +906,7 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + // FIXME we could do that with bit manipulation Packet8f af = half2float(a); Packet8f rf = pnegate(af); return float2half(rf); @@ -957,7 +1010,7 @@ template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::h template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) { Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),15); + res.x = _mm_insert_epi16(a.x,int(b.x),7); return res; } -- cgit v1.2.3 From 1f54164eca5a8960205b94dd3f3cec87089e4ac6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Jul 2018 00:15:07 +0200 Subject: Fix a few issues with Packet16h --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index c068351ce..ae9193b0d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -362,10 +362,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 16, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -414,6 +414,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet1 _mm256_storeu_si256((__m256i*)to, from.x); } +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); + return result; +} + template<> 
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) { Packet16h result; @@ -519,6 +534,11 @@ template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { return half(predux(from_float)); } +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { Packet16f pf[16]; pf[0] = half2float(p[0]); @@ -545,8 +565,9 @@ template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) { __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); Packet16h res; - res.x = _mm256_set_m128i(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), - _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)); + res.x = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); return res; } -- cgit v1.2.3 From 90a53ca6fd8a79d39f926430ec07f626c8260a20 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Sat, 16 Jun 2018 15:13:06 -0700 Subject: Fix the Packet16h version of ptranspose The AVX512 version of ptranspose for PacketBlock was reordering the PacketBlock argument incorrectly. This led to errors in the multiplication of matrices composed of 16-bit floats on AVX512 machines, if at least one of the matrices was using RowMajor order. This error is responsible for one TensorFlow unit test failure on AVX512 machines: //tensorflow/python/kernel_tests:batch_matmul_op_test --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ae9193b0d..9897bd4e5 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -692,20 +692,20 @@ ptranspose(PacketBlock& kernel) { // NOTE: no unpacklo/hi instr in this case, so using permute instr.
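// (For reference, since the fix below is pure lane bookkeeping:
// _mm256_permute2x128_si256(a, b, imm) selects the result's low 128-bit lane
// with imm[3:0] and its high lane with imm[7:4], where 0 = a.lo, 1 = a.hi,
// 2 = b.lo, 3 = b.hi. So 0x20 concatenates the two low lanes (a.lo | b.lo)
// and 0x31 the two high lanes (a.hi | b.hi); the corrected ordering pairs
// each abcdefgh_xx with its matching ijklmnop_xx, emitting all 0x20 results
// before all 0x31 results.)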
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); kernel.packet[0].x = a_p_0; -- cgit v1.2.3 From 359dd77ec3583fa44a927e3ebcee902bd37f6623 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:03:39 +0200 Subject: Fix legitimate "declaration shadows a typedef" warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dd4f366ee..1890efd4d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1526,10 +1526,10 @@ void gebp_kernel::half SResPacketHalf; + const int SResPacketHalfSize = unpacket_traits::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); -- cgit v1.2.3 From ec323b7e6643d9da3d9cda86570cc7724e0a8d3c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:13:19 +0200 Subject: Skip null numerators in triangular-vector-solve (as in BLAS TRSV). 
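In outline: during the substitution loop, an exactly-zero right-hand-side entry contributes nothing to the remaining entries, so both the division by the diagonal and the rank-1 update can be skipped, as reference BLAS xTRSV does with its "if x(j) != 0" guard. A minimal scalar sketch of the idea, assuming a column-major, non-unit lower triangle (illustrative only, not Eigen's actual kernel; trsv_lower_sketch is a made-up name):

template<typename Scalar>
void trsv_lower_sketch(const Scalar* L, int ld, Scalar* rhs, int n)
{
  for (int i = 0; i < n; ++i) {
    if (rhs[i] != Scalar(0)) {          // skip null numerators entirely
      rhs[i] /= L[i + i*ld];            // divide by the diagonal entry
      for (int j = i + 1; j < n; ++j)   // propagate to the entries below
        rhs[j] -= rhs[i] * L[j + i*ld];
    }
  }
}

Besides saving work, this matches the BLAS convention that an exactly-zero entry stays zero even when the corresponding diagonal entry is zero, rather than turning into a NaN.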
--- Eigen/src/Core/products/TriangularSolverVector.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h index b994759b2..647317016 100644 --- a/Eigen/src/Core/products/TriangularSolverVector.h +++ b/Eigen/src/Core/products/TriangularSolverVector.h @@ -58,7 +58,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + if(numext::not_equal_strict(rhs[i],RhsScalar(0))) + { + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), -- cgit v1.2.3 From 6190aa5632fb698fa66d2dad2949275089f15738 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:23:16 +0200 Subject: bug #1567: add optimized path for tensor broadcasting and 'Channel First' shape --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 70 ++++++++++++++++++++-- unsupported/test/cxx11_tensor_broadcasting.cpp | 57 ++++++++++++++++++ 2 files changed, 123 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ab6b3565..b35b36475 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -161,6 +161,22 @@ struct TensorEvaluator, Device> } } } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -256,24 +272,70 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - if (oneByN) { + if (oneByN && !nByOne) { return packetNByOne(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); } else { return packetColMajor(index); } } else { - if (oneByN) { + if (oneByN && !nByOne) { return packetOneByN(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); } else { return packetRowMajor(index); } } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne + (Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < 
dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index a9d268ea6..f0ff03184 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -238,6 +238,59 @@ static void test_simple_broadcasting_n_by_one() } } +template +static void test_simple_broadcasting_one_by_n_by_one_1d() +{ + Tensor tensor(1,7,1); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 13; + Tensor broadcasted; + broadcasted = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcasted.dimension(0), 5); + VERIFY_IS_EQUAL(broadcasted.dimension(1), 7); + VERIFY_IS_EQUAL(broadcasted.dimension(2), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k)); + } + } + } +} + +template +static void test_simple_broadcasting_one_by_n_by_one_2d() +{ + Tensor tensor(1,7,13,1); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 5); + VERIFY_IS_EQUAL(broadcast.dimension(1), 7); + VERIFY_IS_EQUAL(broadcast.dimension(2), 13); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l)); + } + } + } + } +} void test_cxx11_tensor_broadcasting() { @@ -253,4 +306,8 @@ void test_cxx11_tensor_broadcasting() CALL_SUBTEST(test_simple_broadcasting_n_by_one()); CALL_SUBTEST(test_simple_broadcasting_one_by_n()); CALL_SUBTEST(test_simple_broadcasting_n_by_one()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); } -- cgit v1.2.3 From de9e31a06d0324862d9200d08eb3cc4d3d07e660 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 15:41:14 +0200 Subject: Introduce the macro ei_declare_local_nested_eval to help allocate local temporaries on the stack via alloca, and let outer products make good use of it.
If successful, we should use it everywhere nested_eval is used to declare local dense temporaries. --- Eigen/src/Core/ProductEvaluators.h | 4 +- Eigen/src/Core/util/Memory.h | 80 ++++++++++++++++++++++++++++++++++++-- Eigen/src/Core/util/XprHelper.h | 2 +- test/product_notemporary.cpp | 10 ++++- 4 files changed, 89 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 76e5083c1..22ad32ae3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -272,7 +272,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -285,7 +285,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 22d7679c5..43920bdc8 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -582,6 +582,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? 
reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -609,9 +663,11 @@ template void swap(scoped_array &a,scoped_array &b) } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -622,6 +678,14 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. + * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA @@ -641,6 +705,13 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ @@ -648,6 +719,9 @@ template void swap(scoped_array &a,scoped_array &b) TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
From 9357838f94d2907996adadc7e5200376f3561ed4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 10 Jul 2018 09:10:15 +0200
Subject: bug #1543: improve linear indexing for general block expressions

---
 Eigen/src/Core/CoreEvaluators.h | 6 +++---
 test/block.cpp                  | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 6231f2edd..b65ec4ffb 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1080,7 +1080,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
     : m_argImpl(block.nestedExpression()),
       m_startRow(block.startRow()),
       m_startCol(block.startCol()),
-      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+      m_linear_offset((InnerPanel || XprType::IsVectorAtCompileTime)?(XprType::IsRowMajor ? block.startRow()*block.cols() + block.startCol() : block.startCol()*block.rows() + block.startRow()):0)
   { }
 
   typedef typename XprType::Scalar Scalar;
@@ -1088,7 +1088,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = (InnerPanel || XprType::IsVectorAtCompileTime) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1162,7 +1162,7 @@ protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator;
diff --git a/test/block.cpp b/test/block.cpp
index 8c4dd87be..9c2424662 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -162,9 +162,11 @@ template<typename MatrixType> void block(const MatrixType& m)
   // expressions without direct access
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
   VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
 
   VERIFY_IS_APPROX( (m1*1).topRows(r1),  m1.topRows(r1) );
   VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
-- cgit v1.2.3
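The intent of the change is easiest to see on a simplified model of the two coefficient-access paths. This is illustrative only: BlockModel and its signatures are not Eigen's real evaluator interface, only a sketch of the cost difference.

    // Illustrative model, not Eigen's actual evaluator API.
    struct BlockModel {
      long start_row, start_col, rows, cols;  // geometry of the block

      // Generic path: recover the block-local (row, col) pair from the linear
      // index, then shift it into the nested expression. This costs a
      // division/modulo pair per coefficient.
      template<class F> double coeff_generic(F&& arg_coeff2d, long i) const {
        long r = i % rows, c = i / rows;  // column-major unraveling
        return arg_coeff2d(start_row + r, start_col + c);
      }

      // Forwarded path (what ForwardLinearAccess enables, now also for blocks
      // that are compile-time vectors): a single addition with a precomputed
      // offset, like the m_linear_offset member in the evaluator above.
      template<class F> double coeff_forward(F&& arg_coeff1d, long offset, long i) const {
        return arg_coeff1d(offset + i);
      }
    };

Before the commit only InnerPanel blocks took the forwarded path; a one-row or one-column block (a compile-time vector) now qualifies as well, which is what the two new assertions in test/block.cpp exercise.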
From fe723d6129f2fde60e91e7cef52a7907a0cb479f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 10 Jul 2018 09:10:32 +0200
Subject: Fix conversion warning

---
 Eigen/src/Core/util/IntegralConstant.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 78a4705cd..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
 
 // The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
 // This way a code like fix<N> can only refer to the previous function.
 template<int N,typename T>
-inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); }
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
 #endif
 
 #else // EIGEN_PARSED_BY_DOXYGEN
-- cgit v1.2.3
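For reference, a small user-level example of the fix<N>(val) overload being patched; it assumes the Eigen 3.4-style block API, and the concrete sizes are arbitrary.

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      MatrixXd m = MatrixXd::Random(6, 6);
      Index n = 3;  // runtime size; Index is typically a 64-bit signed type

      // fix<3>(n) bundles the compile-time hint 3 with the runtime value n.
      // VariableAndFixedInt stores that value as an int, so passing a 64-bit
      // Index is a narrowing conversion; that is the warning this commit
      // silences by routing the value through internal::convert_index<int>.
      std::cout << m.block(0, 0, fix<3>(n), fix<3>(n)) << "\n";
      return 0;
    }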
From 1625476091c2c4576fe9f2eef91df4d0444dc9e0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 11 Jul 2018 14:00:24 +0200
Subject: Add internal::is_identity compile-time helper

---
 Eigen/src/Core/util/XprHelper.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 3926d3a6c..e3231c712 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -676,6 +676,14 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
 template<typename T1, typename T2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };
-- cgit v1.2.3

From f00d08cc0a987fa624209b920608b56638404f13 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 11 Jul 2018 14:01:47 +0200
Subject: Optimize extraction of Q in SparseQR by exploiting the structure of
 the identity matrix.

---
 Eigen/src/SparseQR/SparseQR.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 7409fcae9..1a28389e8 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -640,7 +640,8 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
-      for (Index k = diagSize-1; k >=0; k--)
+      Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+      for (Index k = start_k; k >=0; k--)
       {
         Scalar tau = Scalar(0);
         tau = m_qr.m_Q.col(k).dot(res.col(j));
-- cgit v1.2.3
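A sketch of the call pattern this optimization targets; the matrix values are arbitrary. Column j of the identity has its single nonzero at row j, so Householder reflections with index k > j leave it unchanged, and the inner loop can start at min(j, diagSize-1) rather than diagSize-1.

    #include <Eigen/Sparse>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      typedef SparseMatrix<double> SpMat;

      // Small invertible sparse matrix (values chosen arbitrarily).
      SpMat A(4, 4);
      A.insert(0,0) = 2; A.insert(1,0) = 1; A.insert(1,1) = 3;
      A.insert(2,2) = 4; A.insert(3,3) = 5;
      A.makeCompressed();

      SparseQR<SpMat, COLAMDOrdering<int> > qr(A);

      // Materialize Q by applying it to an identity expression. The right-hand
      // side is a CwiseNullaryOp with scalar_identity_op, exactly the shape the
      // new is_identity trait detects, so the shortened inner loop is taken.
      MatrixXd Q = qr.matrixQ() * MatrixXd::Identity(4, 4);
      std::cout << "Q orthonormal: " << (Q.transpose() * Q).isIdentity(1e-12) << "\n";
      return 0;
    }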