From ee5864f72e83830f536ad91dc38d574c02a08348 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jun 2018 10:30:12 +0200 Subject: bug #1560 fix product with a 1x1 diagonal matrix --- Eigen/src/Core/ProductEvaluators.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'Eigen/src/Core/ProductEvaluators.h') diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 0d5aec570..60de771b2 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -785,7 +785,11 @@ public: _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), - Alignment = evaluator::Alignment + Alignment = evaluator::Alignment, + + AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) @@ -797,7 +801,10 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { - return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + if(AsScalarProduct) + return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); + else + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } protected: -- cgit v1.2.3 From d428a199ab70bc08db7551457a1e9d8f65d9ebb9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Jul 2018 11:41:09 +0200 Subject: bug #1562: optimize evaluation of small products of the form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary. Measured speedup from 2x to 5x... --- Eigen/src/Core/ProductEvaluators.h | 28 ++++++++++++++++++++++++++- Eigen/src/Core/products/GeneralMatrixMatrix.h | 8 ++++---- test/product.h | 11 +++++++++++ test/product_large.cpp | 2 ++ 4 files changed, 44 insertions(+), 5 deletions(-) (limited to 'Eigen/src/Core/ProductEvaluators.h') diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 60de771b2..8072a1959 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -396,7 +396,7 @@ struct generic_product_impl // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } - + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { @@ -410,6 +410,32 @@ struct generic_product_impl // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } + + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. + // For them, this strategy is also faster than simply by-passing the heap allocation through + // stack allocation. + // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + } + + // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + // template // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index ed4d3182b..dc47cf7cf 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -431,10 +431,10 @@ struct generic_product_impl // to determine the following heuristic. // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, // unless it has been specialized by the user or for a given architecture. - // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. // I'm not sure it is still required. if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::evalTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -446,7 +446,7 @@ struct generic_product_impl static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::addTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -455,7 +455,7 @@ struct generic_product_impl static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::subTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } diff --git a/test/product.h b/test/product.h index 0425a929e..d26e8063d 100644 --- a/test/product.h +++ b/test/product.h @@ -111,6 +111,17 @@ template void product(const MatrixType& m) vcres.noalias() -= m1.transpose() * v1; VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1); + // test scaled products + res = square; + res.noalias() = s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() += s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() -= s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose())); + // test d ?= a+b*c rules res.noalias() = square + m1 * m2.transpose(); VERIFY_IS_APPROX(res, square + m1 * m2.transpose()); diff --git a/test/product_large.cpp b/test/product_large.cpp index 845cd40ca..14a4f739d 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -35,6 +35,8 @@ void test_product_large() for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_2( product(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_2( product(MatrixXd(internal::random(1,10), internal::random(1,10))) ); + CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); -- cgit v1.2.3 From f7124b3e467363e45c3d906b7003f1520a5f804a Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 11 Jun 2018 18:33:24 +0200 Subject: Extend CUDA support to matrix inversion and selfadjointeigensolver --- Eigen/Core | 39 ++++++++++----- Eigen/PardisoSupport | 0 Eigen/src/Core/DenseStorage.h | 58 +++++++++++++++-------- Eigen/src/Core/GeneralProduct.h | 4 +- Eigen/src/Core/MathFunctions.h | 31 ++++++++++-- Eigen/src/Core/MathFunctionsImpl.h | 5 +- Eigen/src/Core/MatrixBase.h | 8 ++++ Eigen/src/Core/NumTraits.h | 6 +++ Eigen/src/Core/PermutationMatrix.h | 6 +-- Eigen/src/Core/Product.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 3 +- Eigen/src/Core/Transpose.h | 1 + Eigen/src/Core/TriangularMatrix.h | 2 + Eigen/src/Core/products/SelfadjointMatrixVector.h | 12 +++-- Eigen/src/Core/products/SelfadjointRank2Update.h | 3 +- Eigen/src/Core/util/BlasUtil.h | 8 ++-- Eigen/src/Core/util/Memory.h | 4 +- Eigen/src/Core/util/Meta.h | 5 +- Eigen/src/Core/util/XprHelper.h | 2 + Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 16 ++++--- Eigen/src/Eigenvalues/Tridiagonalization.h | 9 +++- Eigen/src/Householder/Householder.h | 4 ++ Eigen/src/Householder/HouseholderSequence.h | 17 ++++++- Eigen/src/Jacobi/Jacobi.h | 11 ++++- Eigen/src/LU/Determinant.h | 15 ++++-- Eigen/src/LU/InverseImpl.h | 2 + Eigen/src/plugins/BlockMethods.h | 1 + 27 files changed, 200 insertions(+), 74 deletions(-) mode change 100755 => 100644 Eigen/PardisoSupport (limited to 'Eigen/src/Core/ProductEvaluators.h') diff --git a/Eigen/Core b/Eigen/Core index f6bc18a08..5117461c7 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -45,27 +45,40 @@ #ifdef EIGEN_EXCEPTIONS #undef EIGEN_EXCEPTIONS #endif +#endif - // All functions callable from CUDA code must be qualified with __device__ - #ifdef EIGEN_CUDACC - // Do not try to vectorize on CUDA and SYCL! - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif +// All functions callable from CUDA code must be qualified with __device__ +#ifdef EIGEN_CUDACC + // Do not try to vectorize on CUDA and SYCL! + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif - #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include - #else - #define EIGEN_DEVICE_FUNC + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include + + #if EIGEN_HAS_CONSTEXPR + // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules + #if defined(__NVCC__) + // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr + #ifdef __CUDACC_RELAXED_CONSTEXPR__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif + #elif defined(__clang__) && defined(__CUDA__) + // clang++ always considers constexpr functions as implicitly __host__ __device__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif #endif #else #define EIGEN_DEVICE_FUNC #endif #ifdef __NVCC__ -#define EIGEN_DONT_VECTORIZE + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif #endif // When compiling CUDA device code with NVCC, pull in math functions from the diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport old mode 100755 new mode 100644 diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 9e58fbf88..3c02a1025 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -207,7 +207,9 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data, other.m_data); + } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} @@ -267,7 +269,11 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } + { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) @@ -459,14 +475,16 @@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_cols,other.m_cols); + } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) @@ -533,14 +551,16 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 694f7cbde..bd2361e9a 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -163,13 +163,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } }; template diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 954863c39..a5740334a 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -864,7 +864,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -881,19 +881,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) return max EIGEN_NOT_A_MACRO (x,y); } - #elif defined(__SYCL_DEVICE_ONLY__) template EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - return y < x ? y : x; } template EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - return x < y ? y : x; } @@ -937,7 +934,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo return cl::sycl::max(x,y); } - EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return cl::sycl::fmin(x,y); @@ -971,6 +967,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) +{ + return fmin(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) +{ + return fminl(x, y); +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -983,6 +992,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) +{ + return fmax(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) +{ + return fmaxl(x, y); +} #endif diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 28737c15e..a23e93ccb 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x) } template -EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) { EIGEN_USING_STD_MATH(sqrt); @@ -82,7 +82,8 @@ template struct hypot_impl { typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) + static EIGEN_DEVICE_FUNC + inline RealScalar run(const Scalar& x, const Scalar& y) { EIGEN_USING_STD_MATH(abs); return positive_real_hypot(abs(x), abs(y)); diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 11435903b..6046c8bae 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -328,6 +328,7 @@ template class MatrixBase inline const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -337,12 +338,15 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + + EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -414,15 +418,19 @@ template class MatrixBase ////////// Householder module /////////// + EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template + EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 5567d4c90..b053cff07 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -21,12 +21,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { + EIGEN_DEVICE_FUNC static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { + EIGEN_DEVICE_FUNC static int run() { using std::log10; using std::ceil; @@ -38,6 +40,7 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { + EIGEN_DEVICE_FUNC static int run() { return 0; } }; @@ -49,12 +52,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits_impl { + EIGEN_DEVICE_FUNC static int run() { return std::numeric_limits::digits; } }; template struct default_digits_impl // Floating point { + EIGEN_DEVICE_FUNC static int run() { using std::log; using std::ceil; @@ -66,6 +71,7 @@ struct default_digits_impl // Floating point template struct default_digits_impl // Integer { + EIGEN_DEVICE_FUNC static int run() { return 0; } }; diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index b1fb455b9..acd085301 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -99,13 +99,13 @@ class PermutationBase : public EigenBase #endif /** \returns the number of rows */ - inline Index rows() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline Index cols() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline Index size() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 3d67d9489..70790dbd4 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -127,7 +127,7 @@ public: using Base::derived; typedef typename Base::Scalar Scalar; - EIGEN_STRONG_INLINE operator const Scalar() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 8072a1959..76e5083c1 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -767,7 +767,8 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template - static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index ba7d6e629..d7c204579 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -79,6 +79,7 @@ template class Transpose nestedExpression() { return m_matrix; } /** \internal */ + EIGEN_DEVICE_FUNC void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index ab73fcf21..521de6160 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -65,6 +65,7 @@ template class TriangularBase : public EigenBase inline Index innerStride() const { return derived().innerStride(); } // dummy resize function + EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -716,6 +717,7 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; + EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 67390f1d7..d38fd72b2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -27,7 +27,8 @@ template -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index d395888e5..09209f733 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i struct blas_traits ExtractType, typename _ExtractType::PlainObject >::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -318,8 +318,8 @@ struct blas_traits, const CwiseNullaryOp typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 53300c388..22d7679c5 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -542,7 +542,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -561,12 +561,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0d0b8c43a..ef9860c4b 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -544,6 +544,7 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -554,7 +555,7 @@ T div_ceil(const T &a, const T &b) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } @@ -565,7 +566,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f7a3d9ce7..7311adec7 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -685,12 +685,14 @@ struct possibly_same_dense { }; template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template +EIGEN_DEVICE_FUNC bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..fbc1ee2f6 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -354,7 +356,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -403,7 +406,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD_MATH(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -479,9 +482,10 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; + EIGEN_USING_STD_MATH(abs); ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -535,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -605,7 +609,7 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(abs); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -807,7 +811,7 @@ template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + EIGEN_USING_STD_MATH(abs); RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17b..c5c1acf46 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index a5f336d18..5bc037f00 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -110,6 +112,7 @@ void MatrixBase::makeHouseholder( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -147,6 +150,7 @@ void MatrixBase::applyHouseholderOnTheLeft( */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index fad1d5ab6..a4f40b75c 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -173,6 +173,7 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) @@ -180,6 +181,7 @@ template class HouseholderS } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), @@ -193,12 +195,14 @@ template class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. @@ -215,6 +219,7 @@ template class HouseholderS * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -252,7 +257,9 @@ template class HouseholderS AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -261,6 +268,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -394,6 +402,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -411,13 +420,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index af1228cb8..bba75fc4f 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -90,6 +90,7 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; @@ -134,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -156,6 +158,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) { makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -164,6 +167,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; @@ -223,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -301,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -314,7 +321,8 @@ template struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..6af63a6e7 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper } template +EIGEN_DEVICE_FUNC const typename Derived::Scalar bruteforce_det4_helper (const MatrixBase& matrix, int j, int k, int m, int n) { @@ -44,7 +46,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +64,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,7 +75,8 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + static EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { // trick by Martin Costabel to compute 4x4 det with only 30 muls return bruteforce_det4_helper(m,0,1,2,3) @@ -89,6 +95,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index f49f23360..1bab00c01 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -290,6 +290,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +333,7 @@ struct Assignment, internal::assign_op +EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index a5748525b..67fdebc6f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1061,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template +EIGEN_DEVICE_FUNC inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { -- cgit v1.2.3 From de9e31a06d0324862d9200d08eb3cc4d3d07e660 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Jul 2018 15:41:14 +0200 Subject: Introduce the macro ei_declare_local_nested_eval to help allocating on the stack local temporaries via alloca, and let outer-products makes a good use of it. If successful, we should use it everywhere nested_eval is used to declare local dense temporaries. --- Eigen/src/Core/ProductEvaluators.h | 4 +- Eigen/src/Core/util/Memory.h | 80 ++++++++++++++++++++++++++++++++++++-- Eigen/src/Core/util/XprHelper.h | 2 +- test/product_notemporary.cpp | 10 ++++- 4 files changed, 89 insertions(+), 7 deletions(-) (limited to 'Eigen/src/Core/ProductEvaluators.h') diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 76e5083c1..22ad32ae3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -272,7 +272,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -285,7 +285,7 @@ template void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 22d7679c5..43920bdc8 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -582,6 +582,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -609,9 +663,11 @@ template void swap(scoped_array &a,scoped_array &b) } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -622,6 +678,14 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. + * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA @@ -641,6 +705,13 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ @@ -648,6 +719,9 @@ template void swap(scoped_array &a,scoped_array &b) TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true) + +#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval::type NAME(XPR); + #endif diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 7311adec7..3926d3a6c 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -460,7 +460,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators. diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 30592b79e..062180f42 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -128,11 +128,19 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 ); // Check outer products + #ifdef EIGEN_ALLOCA + bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT; + #else + bool temp_via_alloca = false; + #endif m3 = cv1 * rv1; VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 ); - VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 ); + VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 ); VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 ); VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 ); + rm3 = cv1 * rv1; + VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 ); VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 ); -- cgit v1.2.3