diff options
-rw-r--r-- | Eigen/src/Cholesky/Cholesky.h | 38 | ||||
-rw-r--r-- | Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h | 39 | ||||
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 42 | ||||
-rw-r--r-- | bench/benchCholesky.cpp | 22 |
4 files changed, 93 insertions, 48 deletions
diff --git a/Eigen/src/Cholesky/Cholesky.h b/Eigen/src/Cholesky/Cholesky.h index 4feeee7c6..02dbed399 100644 --- a/Eigen/src/Cholesky/Cholesky.h +++ b/Eigen/src/Cholesky/Cholesky.h @@ -48,24 +48,28 @@ */ template<typename MatrixType> class Cholesky { - public: - + private: typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar; typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType; + enum { + PacketSize = ei_packet_traits<Scalar>::size, + AlignmentMask = int(PacketSize)-1 + }; + + public: + Cholesky(const MatrixType& matrix) : m_matrix(matrix.rows(), matrix.cols()) { compute(matrix); } - Part<MatrixType, Lower> matrixL(void) const - { - return m_matrix; - } + inline Part<MatrixType, Lower> matrixL(void) const { return m_matrix; } - bool isPositiveDefinite(void) const { return m_isPositiveDefinite; } + /** \returns true if the matrix is positive definite */ + inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; } template<typename Derived> typename Derived::Eval solve(const MatrixBase<Derived> &b) const; @@ -92,20 +96,30 @@ void Cholesky<MatrixType>::compute(const MatrixType& a) RealScalar x; x = ei_real(a.coeff(0,0)); - m_isPositiveDefinite = x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1)); + m_isPositiveDefinite = x > precision<Scalar>() && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1)); m_matrix.coeffRef(0,0) = ei_sqrt(x); - m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / m_matrix.coeff(0,0); + m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / ei_real(m_matrix.coeff(0,0)); for (int j = 1; j < size; ++j) { Scalar tmp = ei_real(a.coeff(j,j)) - m_matrix.row(j).start(j).norm2(); x = ei_real(tmp); - m_isPositiveDefinite = m_isPositiveDefinite && x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1)); + if (x < precision<Scalar>() || (!ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1)))) + { + m_isPositiveDefinite = m_isPositiveDefinite; + return; + } m_matrix.coeffRef(j,j) = x = ei_sqrt(x); int endSize = size-j-1; - if (endSize>0) + if (endSize>0) { + // Note that when all matrix columns have good alignment, then the following + // product is guaranteed to be optimal with respect to alignment. + m_matrix.col(j).end(endSize) = + (m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()).lazy(); + m_matrix.col(j).end(endSize) = (a.row(j).end(endSize).adjoint() - - m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()) / x; + - m_matrix.col(j).end(endSize) ) / x; + } } } diff --git a/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h b/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h index 38d9e9e50..ac26c88b1 100644 --- a/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h +++ b/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h @@ -60,23 +60,13 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot } /** \returns the lower triangular matrix L */ - Part<MatrixType, UnitLower> matrixL(void) const - { - return m_matrix; - } + inline Part<MatrixType, UnitLower> matrixL(void) const { return m_matrix; } /** \returns the coefficients of the diagonal matrix D */ - DiagonalCoeffs<MatrixType> vectorD(void) const - { - return m_matrix.diagonal(); - } + inline DiagonalCoeffs<MatrixType> vectorD(void) const { return m_matrix.diagonal(); } - /** \returns whether the matrix is positive definite */ - bool isPositiveDefinite(void) const - { - // FIXME is it really correct ? - return m_matrix.diagonal().real().minCoeff() > RealScalar(0); - } + /** \returns true if the matrix is positive definite */ + inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; } template<typename Derived> typename Derived::Eval solve(const MatrixBase<Derived> &b) const; @@ -91,6 +81,8 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot * is not stored), and the diagonal entries correspond to D. */ MatrixType m_matrix; + + bool m_isPositiveDefinite; }; /** Compute / recompute the Cholesky decomposition A = L D L^* = U^* D U of \a matrix @@ -101,6 +93,13 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a) assert(a.rows()==a.cols()); const int size = a.rows(); m_matrix.resize(size, size); + m_isPositiveDefinite = true; + + // Let's preallocate a temporay vector to evaluate the matrix-vector product into it. + // Unlike the standard Cholesky decomposition, here we cannot evaluate it to the destination + // matrix because it a sub-row which is not compatible suitable for efficient packet evaluation. + // (at least if we assume the matrix is col-major) + Matrix<Scalar,MatrixType::RowsAtCompileTime,1> _temporary(size); // Note that, in this algorithm the rows of the strict upper part of m_matrix is used to store // column vector, thus the strange .conjugate() and .transpose()... @@ -112,11 +111,21 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a) RealScalar tmp = ei_real(a.coeff(j,j) - (m_matrix.row(j).start(j) * m_matrix.col(j).start(j).conjugate()).coeff(0,0)); m_matrix.coeffRef(j,j) = tmp; + if (ei_isMuchSmallerThan(tmp,RealScalar(1))) + { + m_isPositiveDefinite = false; + return; + } + int endSize = size-j-1; if (endSize>0) { + _temporary.end(endSize) = ( m_matrix.block(j+1,0, endSize, j) + * m_matrix.col(j).start(j).conjugate() ).lazy(); + m_matrix.row(j).end(endSize) = a.row(j).end(endSize).conjugate() - - (m_matrix.block(j+1,0, endSize, j) * m_matrix.col(j).start(j).conjugate()).transpose(); + - _temporary.end(endSize).transpose(); + m_matrix.col(j).end(endSize) = m_matrix.row(j).end(endSize) / tmp; } } diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index 83f5da558..a54f1743f 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -359,6 +359,19 @@ static void ei_cache_friendly_product( #endif // EIGEN_EXTERN_INSTANTIATIONS +template<typename Scalar> +inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset) +{ + typedef typename ei_packet_traits<Scalar>::type Packet; + const int PacketSize = ei_packet_traits<Scalar>::size; + const int PacketAlignedMask = PacketSize-1; + const bool Vectorized = PacketSize>1; + return Vectorized + ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask)) + & PacketAlignedMask, maxOffset) + : 0; +} + /* Optimized col-major matrix * vector product: * This algorithm processes 4 columns at onces that allows to both reduce * the number of load/stores of the result by a factor 4 and to reduce @@ -397,9 +410,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type that is mandatory anyway. - const int alignedStart = Vectorized - ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size) - : 0; + const int alignedStart = ei_alignmentOffset(res,size); const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; @@ -408,9 +419,13 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( : alignmentStep==2 ? EvenAligned : FirstAligned; + // we cannot assume the first element is aligned because of sub-matrices + const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0); + // find how many columns do we have to skip to be aligned with the result (if possible) int skipColumns=0; - for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns) + for (; skipColumns<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipColumns; ++skipColumns) {} if (skipColumns==PacketSize) { @@ -423,6 +438,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( skipColumns = std::min(skipColumns,rhs.size()); // note that the skiped columns are processed later. } + + ei_internal_assert((alignmentPattern==NoneAligned) + || size_t(lhs+alignedStart+alignmentStep*skipColumns)%sizeof(Packet)==0); + int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { @@ -500,7 +519,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( res[j] += ei_pfirst(ptmp0) * lhs0[j]; // process aligned result's coeffs - if (((i*lhsStride) % PacketSize+alignedStart) == 0) + if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) for (int j = alignedStart;j<alignedSize;j+=PacketSize) ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j]))); else @@ -557,9 +576,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type that is mandatory anyway. - const int alignedStart = Vectorized - ? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size) - : 0; + const int alignedStart = ei_alignmentOffset(rhs, size); const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; @@ -568,9 +585,12 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( : alignmentStep==2 ? EvenAligned : FirstAligned; + // we cannot assume the first element is aligned because of sub-matrices + const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0); // find how many rows do we have to skip to be aligned with rhs (if possible) int skipRows=0; - for (; skipRows<PacketSize && alignedStart != alignmentStep*skipRows; ++skipRows) + for (; skipRows<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipRows; ++skipRows) {} if (skipRows==PacketSize) { @@ -583,6 +603,8 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( skipRows = std::min(skipRows,res.size()); // note that the skiped columns are processed later. } + ei_internal_assert((alignmentPattern==NoneAligned) + || size_t(lhs+alignedStart+alignmentStep*skipRows)%sizeof(Packet)==0); int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (int i=skipRows; i<rowBound; i+=rowsAtOnce) @@ -652,7 +674,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((i*lhsStride+alignedStart) % PacketSize==0) + if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) for (int j = alignedStart;j<alignedSize;j+=PacketSize) ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0); else diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp index 88b52eb4e..dd0c7f83b 100644 --- a/bench/benchCholesky.cpp +++ b/bench/benchCholesky.cpp @@ -34,7 +34,7 @@ __attribute__ ((noinline)) void benchCholesky(const MatrixType& m) typedef typename MatrixType::Scalar Scalar; typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType; - MatrixType a = MatrixType::random(rows,cols); + MatrixType a = MatrixType::Random(rows,cols); SquareMatrixType covMat = a * a.adjoint(); BenchTimer timerNoSqrt, timerSqrt; @@ -108,7 +108,7 @@ __attribute__ ((noinline)) void benchCholesky(const MatrixType& m) int main(int argc, char* argv[]) { - const int dynsizes[] = {4,6,8,12,16,24,32,64,128,256,512,0}; + const int dynsizes[] = {/*4,6,8,12,16,24,32,49,64,67,128,129,130,131,132,*/256,257,258,259,260,512,0}; std::cout << "size no sqrt standard"; #ifdef BENCH_GSL std::cout << " GSL (standard + double + ATLAS) "; @@ -118,15 +118,15 @@ int main(int argc, char* argv[]) for (uint i=0; dynsizes[i]>0; ++i) benchCholesky(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i])); - benchCholesky(Matrix<Scalar,2,2>()); - benchCholesky(Matrix<Scalar,3,3>()); - benchCholesky(Matrix<Scalar,4,4>()); - benchCholesky(Matrix<Scalar,5,5>()); - benchCholesky(Matrix<Scalar,6,6>()); - benchCholesky(Matrix<Scalar,7,7>()); - benchCholesky(Matrix<Scalar,8,8>()); - benchCholesky(Matrix<Scalar,12,12>()); - benchCholesky(Matrix<Scalar,16,16>()); +// benchCholesky(Matrix<Scalar,2,2>()); +// benchCholesky(Matrix<Scalar,3,3>()); +// benchCholesky(Matrix<Scalar,4,4>()); +// benchCholesky(Matrix<Scalar,5,5>()); +// benchCholesky(Matrix<Scalar,6,6>()); +// benchCholesky(Matrix<Scalar,7,7>()); +// benchCholesky(Matrix<Scalar,8,8>()); +// benchCholesky(Matrix<Scalar,12,12>()); +// benchCholesky(Matrix<Scalar,16,16>()); return 0; } |