From 5418154a45db637211e94f11ee04c6ae4dc8cf85 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 20 Jun 2018 17:51:48 -0700
Subject: Fix oversharding bug in parallelFor.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ca9ba402e..90fd99027 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -189,9 +189,11 @@ struct ThreadPoolDevice {
     // of blocks to be evenly dividable across threads.
     double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
-    const Index max_block_size =
-        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+                               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
     if (block_align) {
       Index new_block_size = block_align(block_size);
       eigen_assert(new_block_size >= block_size);
@@ -205,7 +207,8 @@ struct ThreadPoolDevice {
         (divup<int>(block_count, numThreads()) * numThreads());
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
       // This is the next block size that divides size into a smaller number
       // of blocks than the current block_size.
       Index coarser_block_size = divup(n, prev_block_count - 1);
--
cgit v1.2.3


From 4cc32d80fd091559bc90a55946945fc1584fdd86 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jun 2018 10:28:38 +0200
Subject: bug #1555: compilation fix with XLC

---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f6dbe62a 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--
cgit v1.2.3


From bda71ad3945d4cf2bf06cf567b0b5a8a4663fede Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 22 Jun 2018 15:04:35 -0700
Subject: Fix typo in pblend for AltiVec.
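
The fix below restores the closing parenthesis that the previous change
(bug #1555) dropped from the reinterpret_cast in pblend, which broke
compilation of the VSX path. As a quick illustration of what pblend computes
(hypothetical values, not part of the patch; assumes the usual VSX packet
typedefs), each lane of the result comes from thenPacket where the selector
is 1 and from elsePacket otherwise:

    // illustration only: lane-wise select between two packets
    Selector<2> s; s.select[0] = 1; s.select[1] = 0;
    Packet2d a = { 1.0, 2.0 };
    Packet2d b = { 3.0, 4.0 };
    Packet2d r = internal::pblend(s, a, b);  // expected lanes: { 1.0, 4.0 }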
---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 6d7190a56..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -784,7 +784,7 @@ typedef __vector __bool long Packet2bl;

 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_MZERO = { -0.0, -0.0 };
@@ -1001,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
   Packet2d v[2], sum;
   v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
   v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
 #ifdef _BIG_ENDIAN
   sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
   return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--
cgit v1.2.3


From ee5864f72e83830f536ad91dc38d574c02a08348 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 25 Jun 2018 10:30:12 +0200
Subject: bug #1560 fix product with a 1x1 diagonal matrix

---
 Eigen/src/Core/ProductEvaluators.h | 11 +++++++++--
 test/diagonalmatrices.cpp          | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 0d5aec570..60de771b2 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -785,7 +785,11 @@ public:
     _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =  (DiagonalType::SizeAtCompileTime==1)
+                    || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                    || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };

   diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,7 +801,10 @@ public:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }

 protected:
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index a2092c43e..73b4cd58c 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -30,6 +30,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
             v2 = VectorType::Random(rows);
   RowVectorType rv1 = RowVectorType::Random(cols),
             rv2 = RowVectorType::Random(cols);
+
   LeftDiagonalMatrix ldm1(v1), ldm2(v2);
   RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
@@ -105,6 +106,25 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   sq_m2 = sq_m1 * sq_m2;
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+
+  if(v1.size()==1)
+  {
+    typedef Matrix<Scalar,Dynamic,1> DynVectorType;
+    typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;
+    Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    DynVectorType dv1 = DynVectorType::Random(depth);
+    DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+    DynMatrixType dm1 = dv1;
+    DynMatrixType drm1 = drv1;
+
+    Scalar s = v1(0);
+
+    VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+    VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+    VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+    VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
+  }
 }
@@ -130,6 +150,7 @@ void test_diagonalmatrices()
     CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
   }
   CALL_SUBTEST_10( bug987<0>() );
 }
--
cgit v1.2.3


From f9d337780d49825765cbb8ea51843a905c0e5253 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 25 Jun 2018 14:26:51 +0200
Subject: First step towards a generic vectorised quaternion product

---
 Eigen/src/Geometry/arch/Geometry_SSE.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index f68cab583..d4346aa1c 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -26,17 +26,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
     Quaternion<float> res;
-    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
-    __m128 a = _a.coeffs().template packet<AAlignment>(0);
-    __m128 b = _b.coeffs().template packet<BAlignment>(0);
-    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
-    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+    const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    Packet4f a = _a.coeffs().template packet<AAlignment>(0);
+    Packet4f b = _b.coeffs().template packet<BAlignment>(0);
+    Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstoret<float,Packet4f,ResAlignment>(
               &res.x(),
-              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
-                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
+              padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+                        pmul(vec4f_swizzle1(a,2,0,1,0),
                                    vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+                   pxor(mask,padd(s1,s2))));
     return res;
   }
--
cgit v1.2.3


From b7689bded9e87c95affb5963ad565a68974aa7f7 Mon Sep 17 00:00:00 2001
From: Jonathan Liu <net147@gmail.com>
Date: Thu, 28 Jun 2018 00:32:37 +1000
Subject: Use std::complex constructor instead of assignment from scalar

Fixes GCC "conversion to non-scalar type requested" compile error when using
boost::multiprecision::cpp_dec_float_50 as scalar type.
---
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6f..079e88602 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -316,8 +316,8 @@ struct kissfft_impl
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<Complex*> (src), ncfft);
-        Complex dc = dst[0].real() + dst[0].imag();
-        Complex nyquist = dst[0].real() - dst[0].imag();
+        Complex dc(dst[0].real() + dst[0].imag());
+        Complex nyquist(dst[0].real() - dst[0].imag());
         int k;
         for ( k=1;k <= ncfft2 ; ++k )
         {
           Complex fpk = dst[k];
--
cgit v1.2.3


From 9a81de1d3543d89d2cc5d32294d96b3c58730b21 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 28 Jun 2018 00:20:59 +0200
Subject: Fix order of EIGEN_DEVICE_FUNC and returned type

---
 Eigen/src/Core/ArrayWrapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 688aadd62..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }

-    const typename internal::remove_all<NestedExpressionType>::type&
     EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type&
     nestedExpression() const
     {
       return m_expression;
--
cgit v1.2.3


From 0cdacf3fa49ff52fac3058beae1d6beaa0ebec46 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 29 Jun 2018 11:28:36 +0200
Subject: update comment

---
 Eigen/src/QR/HouseholderQR.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 13c02685d..33cb9c8ff 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -291,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs>
 struct householder_qr_inplace_blocked
 {
-  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
   static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
--
cgit v1.2.3


From a7b313a16cf5b64981dd953f150327638379e68b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sun, 1 Jul 2018 22:45:47 +0200
Subject: Fix unit test

---
 test/diagonalmatrices.cpp | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index 73b4cd58c..a4ff10239 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -106,25 +106,32 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   sq_m2 = sq_m1 * sq_m2;
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+}

-  if(v1.size()==1)
-  {
-    typedef Matrix<Scalar,Dynamic,1> DynVectorType;
-    typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;
-    Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
-    DynVectorType dv1 = DynVectorType::Random(depth);
-    DynRowVectorType drv1 = DynRowVectorType::Random(depth);
-    DynMatrixType dm1 = dv1;
-    DynMatrixType drm1 = drv1;
-
-    Scalar s = v1(0);
+template<typename MatrixType> void as_scalar_product(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DynMatrixType;
+  typedef Matrix<Scalar,Dynamic,1> DynVectorType;
+  typedef Matrix<Scalar,1,Dynamic> DynRowVectorType;

-    VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
-    VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+  Index rows  = m.rows();
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);

-    VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
-    VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
-  }
+  VectorType v1 = VectorType::Random(rows);
+  DynVectorType dv1 = DynVectorType::Random(depth);
+  DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+  DynMatrixType dm1 = dv1;
+  DynMatrixType drm1 = drv1;
+
+  Scalar s = v1(0);
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+  VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+  VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
 }

 template<int>
@@ -142,15 +149,19 @@ void test_diagonalmatrices()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) );
+
     CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
     CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) );
     CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
     CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) );
     CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
     CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
+    CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
   }
   CALL_SUBTEST_10( bug987<0>() );
 }
--
cgit v1.2.3


From d428a199ab70bc08db7551457a1e9d8f65d9ebb9 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 11:41:09 +0200
Subject: bug #1562: optimize evaluation of small products of the form s*A*B
 by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary.
 Measured speedup from 2x to 5x...
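
For example (hypothetical sizes, public API only; this is the pattern caught
by the new eval_dynamic overload below):

    MatrixXf A(8,8), B(8,8), C(8,8);
    float s = 0.5f;
    C.noalias() = s * A * B;  // before: (s*A) is evaluated into a temporary,
                              // after:  evaluated as s * (A.lazyProduct(B))
                              //         for small dynamic-size products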
---
 Eigen/src/Core/ProductEvaluators.h            | 28 ++++++++++++++++++++++++++-
 Eigen/src/Core/products/GeneralMatrixMatrix.h |  8 ++++----
 test/product.h                                | 11 +++++++++++
 test/product_large.cpp                        |  2 ++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 60de771b2..8072a1959 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // but easier on the compiler side
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
   }

   template<typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
+
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... so let's be conservative and enable it for dynamic sizes only,
+  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                                  const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+  }
+
+  // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..dc47cf7cf 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     // to determine the following heuristic.
     // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
     // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
     // I'm not sure it is still required.
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::evalTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
       dst.setZero();
@@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::addTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
   }
@@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
-      lazyproduct::subTo(dst, lhs, rhs);
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
   }
diff --git a/test/product.h b/test/product.h
index 0425a929e..d26e8063d 100644
--- a/test/product.h
+++ b/test/product.h
@@ -111,6 +111,17 @@ template<typename MatrixType> void product(const MatrixType& m)
   vcres.noalias() -= m1.transpose() * v1;
   VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);

+  // test scaled products
+  res = square;
+  res.noalias() = s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() += s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() -= s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose()));
+
   // test d ?= a+b*c rules
   res.noalias() = square + m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 845cd40ca..14a4f739d 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -35,6 +35,8 @@ void test_product_large()
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
+
     CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
--
cgit v1.2.3


From d6255649361fde3e937320dfd216a84203adc295 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 11:50:41 +0200
Subject: Simplify redux_evaluator using inheritance, and properly rename
 parameters in reducers.

---
 Eigen/src/Core/Redux.h | 245 +++++++++++++++++++++++--------------------------
 1 file changed, 115 insertions(+), 130 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0a99acb21..32574ba60 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/

-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime
   };

   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
     MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };

 public:
   enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };

 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
     EIGEN_DEBUG_VAR(PacketSize)

 /*** no vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };

 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };

 /*** vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     HalfLength = Length/2
   };

-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
   {
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
   }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
+    index = Start * redux_traits<Func, Evaluator>::PacketSize,
+    outer = index / int(Evaluator::InnerSizeAtCompileTime),
+    inner = index % int(Evaluator::InnerSizeAtCompileTime),
+    alignment = Evaluator::Alignment
   };

-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };

 /***************************************************************************
 * Part 3 : implementation of all cases
 ***************************************************************************/

-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < eval.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < eval.outerSize(); ++i)
+      for(Index j = 0; j < eval.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
 {};

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static Scalar run(const Derived &mat, const Func& func)
+  static Scalar run(const Evaluator &eval, const Func& func)
   {
-    const Index size = mat.size();
+    const Index size = eval.size();

-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(eval.nestedExpression());
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
     const Index alignedEnd  = alignedStart + alignedSize;
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }

         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);

       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));

       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }

     return res;
   }
 };

 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = eval.innerSize();
+    const Index outerSize = eval.outerSize();
     enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));

       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func);
     }

     return res;
   }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func,Derived,VectorizedSize,Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func,Evaluator,VectorizedSize,Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func,Derived,0,Size>::run(mat,func);
+      return redux_novec_unroller<Func,Evaluator,0,Size>::run(eval,func);
     }
   }
 };

 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr), m_xpr(xpr) {}

   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;

   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };

   EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
   EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
   EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
   EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
   EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }

   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   const XprType & nestedExpression() const { return m_xpr; }

 protected:
-  internal::evaluator<XprType> m_evaluator;
   const XprType &m_xpr;
 };
--
cgit v1.2.3


From 047677a08d907031bfa596017c15493c982181c5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 12:18:25 +0200
Subject: Fix regression in changeset f05dea6b2326836e5e0243fbaffbece84b833d64
 : computeFromHessenberg can take any expression for matrixQ, not only an
 HouseholderSequence.

---
 Eigen/src/Eigenvalues/RealSchur.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 4c53344bb..aca8a8279 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
   // Step 1. Reduce to Hessenberg form
   m_hess.compute(matrix.derived()/scale);

-  // Step 2. Reduce to real Schur form
-  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+  // Step 2. Reduce to real Schur form
+  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
+  m_workspaceVector.resize(matrix.cols());
+  if(computeU)
+    m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);

   m_matT *= scale;

@@ -285,8 +290,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
   m_matT = matrixH;
   m_workspaceVector.resize(m_matT.cols());
-  if(computeU)
-    matrixQ.evalTo(m_matU, m_workspaceVector);
+  if(computeU && !internal::is_same_dense(m_matU,matrixQ))
+    m_matU = matrixQ;

   Index maxIters = m_maxIters;
   if (maxIters == -1)
--
cgit v1.2.3


From 67ec37f7b0d9ccaaede508109e50528138eaf4d8 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 2 Jul 2018 12:54:14 +0200
Subject: Activate dgmres unit test

---
 unsupported/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index e99eab0e3..fa4e76e42 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -124,6 +124,7 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
--
cgit v1.2.3


From 3ae2083e230ec9517cb9b40d1137dec7ba4b2b17 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 3 Jul 2018 13:21:43 +0200
Subject: Make is_same_dense compatible with different scalar types.

---
 Eigen/src/Core/util/XprHelper.h |  9 +++++++--
 test/is_same_dense.cpp          |  8 ++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 16714fa5c..f7a3d9ce7 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -680,13 +680,18 @@ template<typename T1, typename T2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };

 template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
 }

 template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return false;
 }
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
index 2c7838ce9..c4e2fbc92 100644
--- a/test/is_same_dense.cpp
+++ b/test/is_same_dense.cpp
@@ -14,9 +14,13 @@ using internal::is_same_dense;
 void test_is_same_dense()
 {
   typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+  typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
   ColMatrixXd m1(10,10);
+  ColMatrixXcd m2(10,10);
   Ref<ColMatrixXd> ref_m1(m1);
+  Ref<ColMatrixXd,0,Stride<Dynamic,Dynamic> > ref_m2_real(m2.real());
   Ref<const ColMatrixXd> const_ref_m1(m1);
+
   VERIFY(is_same_dense(m1,m1));
   VERIFY(is_same_dense(m1,ref_m1));
   VERIFY(is_same_dense(const_ref_m1,m1));
@@ -30,4 +34,8 @@ void test_is_same_dense()
   Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
   VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+
+
+  VERIFY(!is_same_dense(m1, ref_m2_real));
+  VERIFY(!is_same_dense(m2, ref_m2_real));
 }
--
cgit v1.2.3


From 6a241bd8ee45df0ba112d4d6874888499b51cd34 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 3 Jul 2018 14:02:46 +0200
Subject: Implement custom inplace triangular product to avoid a temporary

---
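Note (sketch, not part of the commit): the product replaced below has the
form x = x * U with U upper triangular, which can be evaluated in place
because the result coefficient x(j) only depends on x(0..j). A minimal
standalone sketch of that dependency, with hypothetical names x, U, n:

    // new x[j] = sum_{k<=j} x[k] * U(k,j); walking j from right to left
    // ensures x[j] is overwritten only after no later column needs it.
    for (Index j = n-1; j >= 0; --j) {
      Scalar z = x[j] * U(j,j);
      for (Index k = 0; k < j; ++k)
        z += x[k] * U(k,j);
      x[j] = z;
    }

The loop introduced in the diff is the same idea, restructured to scatter each
old coefficient into the remaining tail of the row instead of gathering.
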
 Eigen/src/Householder/BlockHouseholder.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 01a7ed188..39ce1c2a0 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
                                                         * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();

-      // FIXME add .noalias() once the triangular product can work inplace
-      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      // FIXME use the following line with .noalias() once the triangular product can work inplace
+      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      for(Index j=nbVecs-1; j>i; --j)
+      {
+        typename TriangularFactorType::Scalar z = triFactor(i,j);
+        triFactor(i,j) = z * triFactor(j,j);
+        if(nbVecs-j-1>0)
+          triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+      }
     }
     triFactor(i,i) = hCoeffs(i);
--
cgit v1.2.3


From 05371239533012e652de0b88a3e0aa992a48a80f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 5 Jul 2018 09:21:26 +0200
Subject: bug #1565: help MSVC to generate not too bad ASM in reductions.

---
 Eigen/src/Core/Redux.h | 62 +++++++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 32574ba60..ddce65468 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -187,16 +187,17 @@ struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;

+  template<typename XprType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func)
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
     res = eval.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < eval.innerSize(); ++i)
+    for(Index i = 1; i < xpr.innerSize(); ++i)
       res = func(res, eval.coeffByOuterInner(0, i));
-    for(Index i = 1; i < eval.outerSize(); ++i)
-      for(Index j = 0; j < eval.innerSize(); ++j)
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
         res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }

 template<typename Func, typename Evaluator>
 struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
   : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
-{};
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};

 template<typename Func, typename Evaluator>
 struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;
   typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

-  static Scalar run(const Evaluator &eval, const Func& func)
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    const Index size = eval.size();
+    const Index size = xpr.size();

     const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
       alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
       alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(eval.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -272,11 +283,12 @@ struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
   typedef typename Evaluator::Scalar Scalar;
   typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

-  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = eval.innerSize();
-    const Index outerSize = eval.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
     enum {
       packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }

     return res;
   }
 };

 template<typename Func, typename Evaluator>
 struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
   typedef typename Evaluator::Scalar Scalar;

   typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
   enum {
     PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     Size = Evaluator::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };

+  template<typename XprType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func)
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
   {
-    eigen_assert(eval.rows()>0 && eval.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
       Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
       if (VectorizedSize != Size)
         res = func(res,redux_novec_unroller<Func,Evaluator,VectorizedSize,Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
       return redux_novec_unroller<Func,Evaluator,0,Size>::run(eval,func);
     }
   }
 };

 // evaluator adaptor
 template<typename _XprType>
 class redux_evaluator : public internal::evaluator<_XprType>
 {
   typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}

   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };

-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
   EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
   { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

   template<int LoadMode, typename PacketType>
   PacketType packetByOuterInner(Index outer, Index inner) const
   { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-
-  const XprType & nestedExpression() const { return m_xpr; }
-
-protected:
-  const XprType &m_xpr;
 };

 } // end namespace internal

@@ -400,7 +402,9 @@ DenseBase<Derived>::redux(const Func& func) const

   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());

-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }

 /** \returns the minimum of all coefficients of \c *this.
--
cgit v1.2.3


From f7124b3e467363e45c3d906b7003f1520a5f804a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 11 Jun 2018 18:33:24 +0200
Subject: Extend CUDA support to matrix inversion and selfadjointeigensolver

---
 Eigen/Core                                        | 39 ++++++++++-----
 Eigen/PardisoSupport                              |  0
 Eigen/src/Core/DenseStorage.h                     | 58 +++++++++++++++--------
 Eigen/src/Core/GeneralProduct.h                   |  4 +-
 Eigen/src/Core/MathFunctions.h                    | 31 ++++++++++--
 Eigen/src/Core/MathFunctionsImpl.h                |  5 +-
 Eigen/src/Core/MatrixBase.h                       |  8 ++++
 Eigen/src/Core/NumTraits.h                        |  6 +++
 Eigen/src/Core/PermutationMatrix.h                |  6 +--
 Eigen/src/Core/Product.h                          |  2 +-
 Eigen/src/Core/ProductEvaluators.h                |  3 +-
 Eigen/src/Core/Transpose.h                        |  1 +
 Eigen/src/Core/TriangularMatrix.h                 |  2 +
 Eigen/src/Core/products/SelfadjointMatrixVector.h | 12 +++--
 Eigen/src/Core/products/SelfadjointRank2Update.h  |  3 +-
 Eigen/src/Core/util/BlasUtil.h                    |  8 ++--
 Eigen/src/Core/util/Memory.h                      |  4 +-
 Eigen/src/Core/util/Meta.h                        |  5 +-
 Eigen/src/Core/util/XprHelper.h                   |  2 +
 Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h    | 16 ++++---
 Eigen/src/Eigenvalues/Tridiagonalization.h        |  9 +++-
 Eigen/src/Householder/Householder.h               |  4 ++
 Eigen/src/Householder/HouseholderSequence.h       | 17 ++++++-
 Eigen/src/Jacobi/Jacobi.h                         | 11 ++++-
 Eigen/src/LU/Determinant.h                        | 15 ++++--
 Eigen/src/LU/InverseImpl.h                        |  2 +
 Eigen/src/plugins/BlockMethods.h                  |  1 +
 27 files changed, 200 insertions(+), 74 deletions(-)
 mode change 100755 => 100644 Eigen/PardisoSupport

diff --git a/Eigen/Core b/Eigen/Core
index f6bc18a08..5117461c7 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -45,27 +45,40 @@
   #ifdef EIGEN_EXCEPTIONS
     #undef EIGEN_EXCEPTIONS
   #endif
+#endif

-  // All functions callable from CUDA code must be qualified with __device__
-  #ifdef EIGEN_CUDACC
-    // Do not try to vectorize on CUDA and SYCL!
-    #ifndef EIGEN_DONT_VECTORIZE
-      #define EIGEN_DONT_VECTORIZE
-    #endif
+// All functions callable from CUDA code must be qualified with __device__
+#ifdef EIGEN_CUDACC
+  // Do not try to vectorize on CUDA and SYCL!
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif

-    #define EIGEN_DEVICE_FUNC __host__ __device__
-    // We need cuda_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
-    // works properly on the device side
-    #include <cuda_runtime.h>
-  #else
-    #define EIGEN_DEVICE_FUNC
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  // We need cuda_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
+  // works properly on the device side
+  #include <cuda_runtime.h>
+
+  #if EIGEN_HAS_CONSTEXPR
+    // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+    #if defined(__NVCC__)
+      // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+      #ifdef __CUDACC_RELAXED_CONSTEXPR__
+        #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+      #endif
+    #elif defined(__clang__) && defined(__CUDA__)
+      // clang++ always considers constexpr functions as implicitly __host__ __device__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
   #endif
 #else
   #define EIGEN_DEVICE_FUNC
 #endif

 #ifdef __NVCC__
-#define EIGEN_DONT_VECTORIZE
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
 #endif

 // When compiling CUDA device code with NVCC, pull in math functions from the
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 9e58fbf88..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
     EIGEN_UNUSED_VARIABLE(rows);
     EIGEN_UNUSED_VARIABLE(cols);
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data, other.m_data);
+  }
   EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
   EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
   EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
   EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-  { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+  {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_rows,other.m_rows);
+    numext::swap(m_cols,other.m_cols);
+  }
   EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
   EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
   void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_cols,other.m_cols);
+  }
   EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
   EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
   EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
   EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+    numext::swap(m_data,other.m_data);
+    numext::swap(m_rows,other.m_rows);
+  }
   EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
   EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
   void conservativeResize(Index size, Index rows, Index)
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 694f7cbde..bd2361e9a 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };

 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };

 template<typename Scalar,int Size,int MaxSize>
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 954863c39..a5740334a 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -864,7 +864,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);

 namespace numext {

-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -881,19 +881,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
   return max EIGEN_NOT_A_MACRO (x,y);
 }

-
 #elif defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-
   return y < x ? y : x;
 }

 template<typename T>
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-
   return x < y ? y : x;
 }
@@ -937,7 +934,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
   return cl::sycl::max(x,y);
 }

-
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return cl::sycl::fmin(x,y);
@@ -971,6 +967,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+  return fminl(x, y);
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -983,6 +992,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+  return fmaxl(x, y);
+}
 #endif
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 28737c15e..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x)
 }

 template<typename RealScalar>
-EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
   EIGEN_USING_STD_MATH(sqrt);
@@ -82,7 +82,8 @@ struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 11435903b..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase

     inline const PartialPivLU<PlainObject> lu() const;

+    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;

     template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;

     /////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase

     ////////// Householder module ///////////

+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5567d4c90..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits10; }
 };

 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log10;
     using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
@@ -49,12 +52,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits; }
 };

 template<typename T>
 struct default_digits_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log;
     using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
     #endif

     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }

     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }

     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }

     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 3d67d9489..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -127,7 +127,7 @@ public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;

-  EIGEN_STRONG_INLINE operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 8072a1959..76e5083c1 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -767,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;

   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index ba7d6e629..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
     nestedExpression() { return m_matrix; }

     /** \internal */
+    EIGEN_DEVICE_FUNC
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index ab73fcf21..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     inline Index innerStride() const { return derived().innerStride(); }

     // dummy resize function
+    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 67390f1d7..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product
   enum { LhsUpLo = LhsMode&(Upper|Lower) };

   template<typename Dest>
-  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index d395888e5..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ @@ template<typename XprType> struct blas_traits
                                 ExtractType,
                                 typename _ExtractType::PlainObject
                               >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -318,8 +318,8 @@ struct blas_traits, const CwiseNullaryOp typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 53300c388..22d7679c5 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -542,7 +542,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -561,12 +561,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0d0b8c43a..ef9860c4b 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -544,6 +544,7 @@ using std::numeric_limits; // Integer division with rounding up. 
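// (A quick worked check of the round-up formula (a+b-1)/b used by div_ceil below,
// assuming ordinary truncating integer division: a=7, b=3 gives (7+3-1)/3 = 9/3 = 3,
// i.e. ceil(7/3); the exact-multiple case a=6, b=3 gives (6+3-1)/3 = 8/3 = 2,
// so no spurious extra block is produced.)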
// T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -554,7 +555,7 @@ T div_ceil(const T &a, const T &b) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } @@ -565,7 +566,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f7a3d9ce7..7311adec7 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -685,12 +685,14 @@ struct possibly_same_dense { }; template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..fbc1ee2f6 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -354,7 +356,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -403,7 +406,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD_MATH(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -479,9 +482,10 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; + EIGEN_USING_STD_MATH(abs); ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -535,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -605,7 +609,7 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(abs); Index i0; // Find non-zero column i0 (by 
construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -807,7 +811,7 @@ template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + EIGEN_USING_STD_MATH(abs); RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17b..c5c1acf46 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index a5f336d18..5bc037f00 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -110,6 +112,7 @@ void MatrixBase::makeHouseholder( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -147,6 +150,7 @@ void MatrixBase::applyHouseholderOnTheLeft( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index fad1d5ab6..a4f40b75c 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ 
b/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -173,6 +173,7 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) @@ -180,6 +181,7 @@ template class HouseholderS } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), @@ -193,12 +195,14 @@ template class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. @@ -215,6 +219,7 @@ template class HouseholderS * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -252,7 +257,9 @@ template class HouseholderS AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -261,6 +268,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -394,6 +402,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -411,13 +420,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. 
*/ /* Necessary for .adjoint() and .conjugate() */ diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index af1228cb8..bba75fc4f 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -90,6 +90,7 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; @@ -134,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -156,6 +158,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) { makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -164,6 +167,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; @@ -223,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -301,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -314,7 +321,8 @@ template struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..6af63a6e7 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper } template +EIGEN_DEVICE_FUNC const typename Derived::Scalar bruteforce_det4_helper (const MatrixBase& matrix, int j, int k, int m, int n) { @@ -44,7 +46,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 
+55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +64,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,7 +75,8 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + static EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { // trick by Martin Costabel to compute 4x4 det with only 30 muls return bruteforce_det4_helper(m,0,1,2,3) @@ -89,6 +95,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index f49f23360..1bab00c01 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -290,6 +290,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +333,7 @@ struct Assignment, internal::assign_op +EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index a5748525b..67fdebc6f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1061,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template +EIGEN_DEVICE_FUNC inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { -- cgit v1.2.3 From a8ab6060df270709b9975102382caa0eb8ef22ec Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 09:58:45 +0200 Subject: Add unit tests for inverse and selfadjoint-eigenvalues on CUDA --- test/cuda_basic.cu | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index ce66c2c78..33e5fd119 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -121,7 +121,7 @@ struct diagonal { }; template -struct eigenvalues { +struct eigenvalues_direct { EIGEN_DEVICE_FUNC void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const { @@ -136,6 +136,34 @@ struct eigenvalues { } }; +template +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix Vec; + T M(in+i); + Map res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver eig; + eig.compute(M); + res = eig.eigenvalues(); + } +}; + +template +struct matrix_inverse { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T M(in+i); + Map res(out+i*T::MaxSizeAtCompileTime); + res =
M.inverse(); + } +}; + void test_cuda_basic() { ei_test_init_cuda(); @@ -163,8 +191,13 @@ void test_cuda_basic() CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); } -- cgit v1.2.3 From f4d623ffa772093169a0ac949c7955bf38bf1bbf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:13:36 +0200 Subject: Complete Packet8h implementation and test it in packetmath unit test --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 85 +++++++++++++++++++++++++++++-- test/packetmath.cpp | 15 +++--- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 8a6f209c4..2ee92b4f6 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -493,6 +493,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, con return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -730,10 +737,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 8, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -782,6 +789,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); } +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); + return result; +} + template<> EIGEN_STRONG_INLINE Packet8h ploadquad(const Eigen::half* from) { Packet8h result; @@ -835,6 +853,12 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8f af = half2float(a); + Packet8f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -842,6 +866,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = 
psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -894,6 +925,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& return Eigen::half(reduced); } +template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { + Packet8f pf[8]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + Packet8f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet8h res; + res.x = _mm_shuffle_epi8(a.x,m); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),15); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) + { + if (Offset!=0) + first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); + } +}; + EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128i a = kernel.packet[0].x; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 4b19be92d..94ee91e3b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -123,7 +123,7 @@ template void packetmath() EIGEN_ALIGN_MAX Scalar data2[size]; EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; - RealScalar refvalue = 0; + RealScalar refvalue = RealScalar(0); for (int i=0; i()/RealScalar(PacketSize); @@ -178,7 +178,8 @@ template void packetmath() VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul); VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate); - VERIFY((internal::is_same::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); + // Disabled as it is not clear why it would be mandatory to support division. + //VERIFY((internal::is_same::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd); CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub); @@ -242,29 +243,30 @@ template void packetmath() } } - ref[0] = 0; + ref[0] = Scalar(0); for (int i=0; i(data1)), refvalue) && "internal::predux"); + if(PacketSize==8 && internal::unpacket_traits::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case { int HalfPacketSize = PacketSize>4 ? 
PacketSize/2 : PacketSize; for (int i=0; i(data1))); VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4"); } - ref[0] = 1; + ref[0] = Scalar(1); for (int i=0; i(data1))) && "internal::predux_mul"); for (int j=0; j(data1+j*PacketSize); @@ -630,6 +632,7 @@ void test_packetmath() CALL_SUBTEST_3( packetmath() ); CALL_SUBTEST_4( packetmath >() ); CALL_SUBTEST_5( packetmath >() ); + CALL_SUBTEST_6( packetmath() ); CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); -- cgit v1.2.3 From 56a33ae57dd536f35be007eb051a636cea4467d0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:14:04 +0200 Subject: test product kernel with half-floats. --- test/half_float.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/half_float.cpp b/test/half_float.cpp index 7734f82cc..487d0d1b2 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -257,13 +257,31 @@ void test_array() ss << a1; } +void test_product() +{ + typedef Matrix MatrixXh; + Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index depth = internal::random(1,EIGEN_TEST_MAX_SIZE); + MatrixXh Ah = MatrixXh::Random(rows,depth); + MatrixXh Bh = MatrixXh::Random(depth,cols); + MatrixXh Ch = MatrixXh::Random(rows,cols); + MatrixXf Af = Ah.cast(); + MatrixXf Bf = Bh.cast(); + MatrixXf Cf = Ch.cast(); + VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast()); +} + void test_half_float() { - CALL_SUBTEST(test_conversion()); CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); - CALL_SUBTEST(test_array()); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); + CALL_SUBTEST(test_array()); + CALL_SUBTEST(test_product()); + } } -- cgit v1.2.3 From a937c5020889f95aadccefc79338ed83c2ff8784 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jul 2018 17:41:52 +0200 Subject: palign is not used anymore, so let's relax the unit test --- test/packetmath.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 94ee91e3b..56e017383 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -171,7 +171,10 @@ template void packetmath() for (int i=0; i Date: Fri, 6 Jul 2018 17:43:11 +0200 Subject: complete implementation of Packet16h (AVX512) --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 2ee92b4f6..c068351ce 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -486,6 +486,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + // FIXME we could do that with bit manipulation + Packet16f af = half2float(a); + Packet16f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -512,6 +519,51 @@ template<> EIGEN_STRONG_INLINE 
half predux(const Packet16h& from) { return half(predux(from_float)); } +template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { + Packet16f pf[16]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + pf[8] = half2float(p[8]); + pf[9] = half2float(p[9]); + pf[10] = half2float(p[10]); + pf[11] = half2float(p[11]); + pf[12] = half2float(p[12]); + pf[13] = half2float(p[13]); + pf[14] = half2float(p[14]); + pf[15] = half2float(p[15]); + Packet16f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet16h res; + res.x = _mm256_set_m128i(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,15); + return res; +} + template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) { Packet16h result; @@ -854,6 +906,7 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + // FIXME we could do that with bit manipulation Packet8f af = half2float(a); Packet8f rf = pnegate(af); return float2half(rf); @@ -957,7 +1010,7 @@ template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::h template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) { Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),15); + res.x = _mm_insert_epi16(a.x,int(b.x),7); return res; } -- cgit v1.2.3 From 1f54164eca5a8960205b94dd3f3cec87089e4ac6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Jul 2018 00:15:07 +0200 Subject: Fix a few issues with Packet16h --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index c068351ce..ae9193b0d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -362,10 +362,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 16, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -414,6 +414,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet1 _mm256_storeu_si256((__m256i*)to, from.x); } +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); + return result; +} + template<> 
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) { Packet16h result; @@ -519,6 +534,11 @@ template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { return half(predux(from_float)); } +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { Packet16f pf[16]; pf[0] = half2float(p[0]); @@ -545,8 +565,9 @@ template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) { __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); Packet16h res; - res.x = _mm256_set_m128i(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), - _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)); + res.x = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); return res; } -- cgit v1.2.3 From 90a53ca6fd8a79d39f926430ec07f626c8260a20 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Sat, 16 Jun 2018 15:13:06 -0700 Subject: Fix the Packet16h version of ptranspose The AVX512 version of ptranspose for PacketBlock was reordering the PacketBlock argument incorrectly. This led to errors in the multiplication of matrices composed of 16-bit floats on AVX512 machines, if at least one of the matrices was using RowMajor order. This error is responsible for one TensorFlow unit test failure on AVX512 machines: //tensorflow/python/kernel_tests:batch_matmul_op_test --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ae9193b0d..9897bd4e5 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -692,20 +692,20 @@ ptranspose(PacketBlock& kernel) { // NOTE: no unpacklo/hi instr in this case, so using permute instr.
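// (For reference, since the fix below is pure lane bookkeeping:
// _mm256_permute2x128_si256(a, b, imm) selects the result's low 128-bit lane
// with imm[3:0] and its high lane with imm[7:4], where 0 = a.lo, 1 = a.hi,
// 2 = b.lo, 3 = b.hi. So 0x20 concatenates the two low lanes (a.lo | b.lo)
// and 0x31 the two high lanes (a.hi | b.hi); the corrected ordering pairs
// each abcdefgh_xx with its matching ijklmnop_xx, emitting all 0x20 results
// before all 0x31 results.)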
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); kernel.packet[0].x = a_p_0; -- cgit v1.2.3 From 359dd77ec3583fa44a927e3ebcee902bd37f6623 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:03:39 +0200 Subject: Fix legitimate "declaration shadows a typedef" warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dd4f366ee..1890efd4d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1526,10 +1526,10 @@ void gebp_kernel::half SResPacketHalf; + const int SResPacketHalfSize = unpacket_traits::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); -- cgit v1.2.3 From ec323b7e6643d9da3d9cda86570cc7724e0a8d3c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:13:19 +0200 Subject: Skip null numerators in triangular-vector-solve (as in BLAS TRSV). 
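In outline: during the substitution loop, an exactly-zero right-hand-side entry contributes nothing to the remaining entries, so both the division by the diagonal and the rank-1 update can be skipped, as reference BLAS xTRSV does with its "if x(j) != 0" guard. A minimal scalar sketch of the idea, assuming a column-major, non-unit lower triangle (illustrative only, not Eigen's actual kernel; trsv_lower_sketch is a made-up name):

template<typename Scalar>
void trsv_lower_sketch(const Scalar* L, int ld, Scalar* rhs, int n)
{
  for (int i = 0; i < n; ++i) {
    if (rhs[i] != Scalar(0)) {          // skip null numerators entirely
      rhs[i] /= L[i + i*ld];            // divide by the diagonal entry
      for (int j = i + 1; j < n; ++j)   // propagate to the entries below
        rhs[j] -= rhs[i] * L[j + i*ld];
    }
  }
}

Besides saving work, this matches the BLAS convention that an exactly-zero entry stays zero even when the corresponding diagonal entry is zero, rather than turning into a NaN.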
--- Eigen/src/Core/products/TriangularSolverVector.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h index b994759b2..647317016 100644 --- a/Eigen/src/Core/products/TriangularSolverVector.h +++ b/Eigen/src/Core/products/TriangularSolverVector.h @@ -58,7 +58,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + if(numext::not_equal_strict(rhs[i],RhsScalar(0))) + { + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), -- cgit v1.2.3 From 6190aa5632fb698fa66d2dad2949275089f15738 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 11:23:16 +0200 Subject: bug #1567: add optimized path for tensor broadcasting and 'Channel First' shape --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 70 ++++++++++++++++++++-- unsupported/test/cxx11_tensor_broadcasting.cpp | 57 ++++++++++++++++++ 2 files changed, 123 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ab6b3565..b35b36475 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -161,6 +161,22 @@ struct TensorEvaluator, Device> } } } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -256,24 +272,70 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - if (oneByN) { + if (oneByN && !nByOne) { return packetNByOne(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); } else { return packetColMajor(index); } } else { - if (oneByN) { + if (oneByN && !nByOne) { return packetOneByN(index); - } else if (nByOne) { + } else if (!oneByN && nByOne) { return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); } else { return packetRowMajor(index); } } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne + (Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < 
dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index a9d268ea6..f0ff03184 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -238,6 +238,59 @@ static void test_simple_broadcasting_n_by_one() } } +template +static void test_simple_broadcasting_one_by_n_by_one_1d() +{ + Tensor tensor(1,7,1); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 13; + Tensor broadcasted; + broadcasted = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcasted.dimension(0), 5); + VERIFY_IS_EQUAL(broadcasted.dimension(1), 7); + VERIFY_IS_EQUAL(broadcasted.dimension(2), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k)); + } + } + } +} + +template +static void test_simple_broadcasting_one_by_n_by_one_2d() +{ + Tensor tensor(1,7,13,1); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 5); + VERIFY_IS_EQUAL(broadcast.dimension(1), 7); + VERIFY_IS_EQUAL(broadcast.dimension(2), 13); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l)); + } + } + } + } +} void test_cxx11_tensor_broadcasting() { @@ -253,4 +306,8 @@ void test_cxx11_tensor_broadcasting() CALL_SUBTEST(test_simple_broadcasting_n_by_one()); CALL_SUBTEST(test_simple_broadcasting_one_by_n()); CALL_SUBTEST(test_simple_broadcasting_n_by_one()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); } -- cgit v1.2.3 From de9e31a06d0324862d9200d08eb3cc4d3d07e660 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 15:41:14 +0200 Subject: Introduce the macro ei_declare_local_nested_eval to help allocate local temporaries on the stack via alloca, and let outer products make good use of it.
If successful, we should use it everywhere nested_eval is used to declare local dense temporaries. --- Eigen/src/Core/ProductEvaluators.h | 4 +- Eigen/src/Core/util/Memory.h | 80 ++++++++++++++++++++++++++++++++++++-- Eigen/src/Core/util/XprHelper.h | 2 +- test/product_notemporary.cpp | 10 ++++- 4 files changed, 89 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 76e5083c1..22ad32ae3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -272,7 +272,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -285,7 +285,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 22d7679c5..43920bdc8 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -582,6 +582,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? 
reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -609,9 +663,11 @@ template void swap(scoped_array &a,scoped_array &b) } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -622,6 +678,14 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. + * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA @@ -641,6 +705,13 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ @@ -648,6 +719,9 @@ template void swap(scoped_array &a,scoped_array &b) TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
From 9357838f94d2907996adadc7e5200376f3561ed4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 10 Jul 2018 09:10:15 +0200
Subject: bug #1543: improve linear indexing for general block expressions

---
 Eigen/src/Core/CoreEvaluators.h | 6 +++---
 test/block.cpp                  | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 6231f2edd..b65ec4ffb 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1080,7 +1080,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
     : m_argImpl(block.nestedExpression()),
       m_startRow(block.startRow()),
       m_startCol(block.startCol()),
-      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+      m_linear_offset((InnerPanel || XprType::IsVectorAtCompileTime)?(XprType::IsRowMajor ? block.startRow()*block.cols() + block.startCol() : block.startCol()*block.rows() + block.startRow()):0)
   { }
 
   typedef typename XprType::Scalar Scalar;
@@ -1088,7 +1088,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = (InnerPanel || XprType::IsVectorAtCompileTime) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1162,7 +1162,7 @@ protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator;
diff --git a/test/block.cpp b/test/block.cpp
index 8c4dd87be..9c2424662 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -162,9 +162,11 @@ template<typename MatrixType> void block(const MatrixType& m)
   // expressions without direct access
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
   VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
 
   VERIFY_IS_APPROX( (m1*1).topRows(r1),  m1.topRows(r1) );
   VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
-- cgit v1.2.3
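The intent of the change is easiest to see on a simplified model of the two coefficient-access paths. This is illustrative only: BlockModel and its signatures are not Eigen's real evaluator interface, only a sketch of the cost difference.

    // Illustrative model, not Eigen's actual evaluator API.
    struct BlockModel {
      long start_row, start_col, rows, cols;  // geometry of the block

      // Generic path: recover the block-local (row, col) pair from the linear
      // index, then shift it into the nested expression. This costs a
      // division/modulo pair per coefficient.
      template<class F> double coeff_generic(F&& arg_coeff2d, long i) const {
        long r = i % rows, c = i / rows;  // column-major unraveling
        return arg_coeff2d(start_row + r, start_col + c);
      }

      // Forwarded path (what ForwardLinearAccess enables, now also for blocks
      // that are compile-time vectors): a single addition with a precomputed
      // offset, like the m_linear_offset member in the evaluator above.
      template<class F> double coeff_forward(F&& arg_coeff1d, long offset, long i) const {
        return arg_coeff1d(offset + i);
      }
    };

Before the commit only InnerPanel blocks took the forwarded path; a one-row or one-column block (a compile-time vector) now qualifies as well, which is what the two new assertions in test/block.cpp exercise.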
From fe723d6129f2fde60e91e7cef52a7907a0cb479f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 10 Jul 2018 09:10:32 +0200
Subject: Fix conversion warning

---
 Eigen/src/Core/util/IntegralConstant.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 78a4705cd..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
 
 // The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
 // This way a code like fix<N> can only refer to the previous function.
 template<int N,typename T>
-inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); }
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
 #endif
 
 #else // EIGEN_PARSED_BY_DOXYGEN
-- cgit v1.2.3
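For reference, a small user-level example of the fix<N>(val) overload being patched; it assumes the Eigen 3.4-style block API, and the concrete sizes are arbitrary.

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      MatrixXd m = MatrixXd::Random(6, 6);
      Index n = 3;  // runtime size; Index is typically a 64-bit signed type

      // fix<3>(n) bundles the compile-time hint 3 with the runtime value n.
      // VariableAndFixedInt stores that value as an int, so passing a 64-bit
      // Index is a narrowing conversion; that is the warning this commit
      // silences by routing the value through internal::convert_index<int>.
      std::cout << m.block(0, 0, fix<3>(n), fix<3>(n)) << "\n";
      return 0;
    }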
From 1625476091c2c4576fe9f2eef91df4d0444dc9e0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 11 Jul 2018 14:00:24 +0200
Subject: Add internal::is_identity compile-time helper

---
 Eigen/src/Core/util/XprHelper.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 3926d3a6c..e3231c712 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -676,6 +676,14 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
 template<typename T1, typename T2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };
-- cgit v1.2.3

From f00d08cc0a987fa624209b920608b56638404f13 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 11 Jul 2018 14:01:47 +0200
Subject: Optimize extraction of Q in SparseQR by exploiting the structure of
 the identity matrix.

---
 Eigen/src/SparseQR/SparseQR.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 7409fcae9..1a28389e8 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -640,7 +640,8 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
-      for (Index k = diagSize-1; k >=0; k--)
+      Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+      for (Index k = start_k; k >=0; k--)
       {
         Scalar tau = Scalar(0);
         tau = m_qr.m_Q.col(k).dot(res.col(j));
-- cgit v1.2.3
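A sketch of the call pattern this optimization targets; the matrix values are arbitrary. Column j of the identity has its single nonzero at row j, so Householder reflections with index k > j leave it unchanged, and the inner loop can start at min(j, diagSize-1) rather than diagSize-1.

    #include <Eigen/Sparse>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      typedef SparseMatrix<double> SpMat;

      // Small invertible sparse matrix (values chosen arbitrarily).
      SpMat A(4, 4);
      A.insert(0,0) = 2; A.insert(1,0) = 1; A.insert(1,1) = 3;
      A.insert(2,2) = 4; A.insert(3,3) = 5;
      A.makeCompressed();

      SparseQR<SpMat, COLAMDOrdering<int> > qr(A);

      // Materialize Q by applying it to an identity expression. The right-hand
      // side is a CwiseNullaryOp with scalar_identity_op, exactly the shape the
      // new is_identity trait detects, so the shortened inner loop is taken.
      MatrixXd Q = qr.matrixQ() * MatrixXd::Identity(4, 4);
      std::cout << "Q orthonormal: " << (Q.transpose() * Q).isIdentity(1e-12) << "\n";
      return 0;
    }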