From b466c266a0081685d27855684f22d7ccc5fb10ea Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 23 Jul 2008 17:30:00 +0000
Subject: * Fix some complex alignment issues in the cache friendly matrix-vector
 products.
 * Minor update of the cores of the Cholesky algorithms to make them more friendly
 wrt to matrix-vector products => speedup x5 !

---
 Eigen/src/Cholesky/Cholesky.h                  | 38 +++++++++++++++--------
 Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h | 39 +++++++++++++++---------
 Eigen/src/Core/CacheFriendlyProduct.h          | 42 ++++++++++++++++++++------
 bench/benchCholesky.cpp                        | 22 +++++++-------
 4 files changed, 93 insertions(+), 48 deletions(-)

diff --git a/Eigen/src/Cholesky/Cholesky.h b/Eigen/src/Cholesky/Cholesky.h
index 4feeee7c6..02dbed399 100644
--- a/Eigen/src/Cholesky/Cholesky.h
+++ b/Eigen/src/Cholesky/Cholesky.h
@@ -48,24 +48,28 @@
   */
 template<typename MatrixType> class Cholesky
 {
-  public:
-
+  private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
 
+    enum {
+      PacketSize = ei_packet_traits<Scalar>::size,
+      AlignmentMask = int(PacketSize)-1
+    };
+
+  public:
+
     Cholesky(const MatrixType& matrix)
       : m_matrix(matrix.rows(), matrix.cols())
     {
       compute(matrix);
     }
 
-    Part<MatrixType, Lower> matrixL(void) const
-    {
-      return m_matrix;
-    }
+    inline Part<MatrixType, Lower> matrixL(void) const { return m_matrix; }
 
-    bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
+    /** \returns true if the matrix is positive definite */
+    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
 
     template<typename Derived>
     typename Derived::Eval solve(const MatrixBase<Derived> &b) const;
@@ -92,20 +96,30 @@ void Cholesky<MatrixType>::compute(const MatrixType& a)
 
   RealScalar x;
   x = ei_real(a.coeff(0,0));
-  m_isPositiveDefinite = x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1));
+  m_isPositiveDefinite = x > precision<Scalar>() && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1));
   m_matrix.coeffRef(0,0) = ei_sqrt(x);
-  m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / m_matrix.coeff(0,0);
+  m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / ei_real(m_matrix.coeff(0,0));
   for (int j = 1; j < size; ++j)
   {
     Scalar tmp = ei_real(a.coeff(j,j)) - m_matrix.row(j).start(j).norm2();
     x = ei_real(tmp);
-    m_isPositiveDefinite = m_isPositiveDefinite && x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1));
+    if (x < precision<Scalar>() || (!ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1))))
+    {
+      m_isPositiveDefinite = false;
+      return;
+    }
     m_matrix.coeffRef(j,j) = x = ei_sqrt(x);
     int endSize = size-j-1;
-    if (endSize>0)
+    if (endSize>0) {
+      // Note that when all matrix columns have good alignment, then the following
+      // product is guaranteed to be optimal with respect to alignment.
+      m_matrix.col(j).end(endSize) =
+        (m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()).lazy();
+
       m_matrix.col(j).end(endSize) = (a.row(j).end(endSize).adjoint()
-        - m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()) / x;
+        - m_matrix.col(j).end(endSize) ) / x;
+    }
   }
 }
diff --git a/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h b/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h
index 38d9e9e50..ac26c88b1 100644
--- a/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h
+++ b/Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h
@@ -60,23 +60,13 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot
     }
 
     /** \returns the lower triangular matrix L */
-    Part<MatrixType, UnitLower> matrixL(void) const
-    {
-      return m_matrix;
-    }
+    inline Part<MatrixType, UnitLower> matrixL(void) const { return m_matrix; }
 
     /** \returns the coefficients of the diagonal matrix D */
-    DiagonalCoeffs<MatrixType> vectorD(void) const
-    {
-      return m_matrix.diagonal();
-    }
+    inline DiagonalCoeffs<MatrixType> vectorD(void) const { return m_matrix.diagonal(); }
 
-    /** \returns whether the matrix is positive definite */
-    bool isPositiveDefinite(void) const
-    {
-      // FIXME is it really correct ?
-      return m_matrix.diagonal().real().minCoeff() > RealScalar(0);
-    }
+    /** \returns true if the matrix is positive definite */
+    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
 
     template<typename Derived>
     typename Derived::Eval solve(const MatrixBase<Derived> &b) const;
@@ -91,6 +81,8 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot
       * is not stored), and the diagonal entries correspond to D.
       */
     MatrixType m_matrix;
+
+    bool m_isPositiveDefinite;
 };
 
 /** Compute / recompute the Cholesky decomposition A = L D L^* = U^* D U of \a matrix
@@ -101,6 +93,13 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a)
   assert(a.rows()==a.cols());
   const int size = a.rows();
   m_matrix.resize(size, size);
+  m_isPositiveDefinite = true;
+
+  // Let's preallocate a temporary vector to evaluate the matrix-vector product into it.
+  // Unlike the standard Cholesky decomposition, here we cannot evaluate it into the destination
+  // matrix because it is a sub-row which is not suitable for efficient packet evaluation
+  // (at least if we assume the matrix is col-major).
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,1> _temporary(size);
 
   // Note that, in this algorithm the rows of the strict upper part of m_matrix is used to store
   // column vector, thus the strange .conjugate() and .transpose()...
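
The role of the preallocated temporary is easiest to see outside of Eigen. The sketch below is illustrative only (plain C++, real scalars, column-major arrays; none of the names are Eigen API): at step j the rank-update product L(j+1:n, 0:j) * w is accumulated into a reusable buffer, traversing each column of L contiguously the way the cache-friendly col-major kernel wants, and only then is the new column written back.

    // Illustrative only: plain column-major LDL^T factorization (A = L D L^T,
    // unit lower-triangular L), mirroring the loop structure of the patch.
    #include <cstddef>
    #include <vector>

    void ldlt_sketch(const std::vector<double>& a,  // n x n symmetric, column-major
                     std::vector<double>& l,        // output: unit lower L, column-major
                     std::vector<double>& d,        // output: diagonal of D
                     std::size_t n)
    {
      std::vector<double> tmp(n);  // preallocated once, like _temporary in the patch
      std::vector<double> w(n);    // w[k] = d[k] * l(j,k), the scaled pivot row
      l.assign(n * n, 0.0);
      d.assign(n, 0.0);
      for (std::size_t j = 0; j < n; ++j)
      {
        double diag = a[j + j * n];
        for (std::size_t k = 0; k < j; ++k)
        {
          w[k]  = d[k] * l[j + k * n];
          diag -= w[k] * l[j + k * n];   // d(j) = a(j,j) - sum_k d(k) * l(j,k)^2
        }
        d[j] = diag;                     // assumes a is positive definite, so diag != 0
        l[j + j * n] = 1.0;
        // The matrix-vector product L(j+1:n, 0:j) * w goes into the temporary:
        // for each k the inner loop walks l[.. + k*n] contiguously (col-major).
        for (std::size_t i = j + 1; i < n; ++i) tmp[i] = 0.0;
        for (std::size_t k = 0; k < j; ++k)
          for (std::size_t i = j + 1; i < n; ++i)
            tmp[i] += l[i + k * n] * w[k];
        // Only now is the new column of L written back.
        for (std::size_t i = j + 1; i < n; ++i)
          l[i + j * n] = (a[i + j * n] - tmp[i]) / diag;
      }
    }
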
@@ -112,11 +111,21 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a)
     RealScalar tmp = ei_real(a.coeff(j,j) - (m_matrix.row(j).start(j) * m_matrix.col(j).start(j).conjugate()).coeff(0,0));
     m_matrix.coeffRef(j,j) = tmp;
+    if (ei_isMuchSmallerThan(tmp,RealScalar(1)))
+    {
+      m_isPositiveDefinite = false;
+      return;
+    }
+
     int endSize = size-j-1;
     if (endSize>0)
     {
+      _temporary.end(endSize) = ( m_matrix.block(j+1,0, endSize, j)
+                                  * m_matrix.col(j).start(j).conjugate() ).lazy();
+
       m_matrix.row(j).end(endSize) = a.row(j).end(endSize).conjugate()
-        - (m_matrix.block(j+1,0, endSize, j) * m_matrix.col(j).start(j).conjugate()).transpose();
+        - _temporary.end(endSize).transpose();
+
       m_matrix.col(j).end(endSize) = m_matrix.row(j).end(endSize) / tmp;
     }
   }
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 83f5da558..a54f1743f 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -359,6 +359,19 @@ static void ei_cache_friendly_product(
 
 #endif // EIGEN_EXTERN_INSTANTIATIONS
 
+template<typename Scalar>
+inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = ei_packet_traits<Scalar>::size;
+  const int PacketAlignedMask = PacketSize-1;
+  const bool Vectorized = PacketSize>1;
+  return Vectorized
+       ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask))
+                        & PacketAlignedMask, maxOffset)
+       : 0;
+}
+
 /* Optimized col-major matrix * vector product:
  * This algorithm processes 4 columns at onces that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
@@ -397,9 +410,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
-  const int alignedStart = Vectorized
-    ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
-    : 0;
+  const int alignedStart = ei_alignmentOffset(res,size);
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
   const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
@@ -408,9 +419,13 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
                       : alignmentStep==2 ? EvenAligned
                       : FirstAligned;
 
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+
   // find how many columns do we have to skip to be aligned with the result (if possible)
   int skipColumns=0;
-  for (; skipColumns
-  const int alignedStart = Vectorized
-    ? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
-    : 0;
+  const int alignedStart = ei_alignmentOffset(rhs, size);
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
   //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
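
The new ei_alignmentOffset helper only measures how many scalar entries separate a pointer from the next packet boundary, clamped to the length of the vector. Here is a self-contained sketch of the same computation, with the packet size passed explicitly instead of coming from ei_packet_traits<Scalar> (the real helper additionally returns 0 when vectorization is disabled); the names are illustrative.

    #include <algorithm>
    #include <cstdint>

    // How many Scalar entries must be skipped so that ptr+offset sits on a
    // packet boundary (e.g. 16 bytes for SSE), never more than maxOffset.
    template<typename Scalar, int PacketSize>
    inline int alignment_offset(const Scalar* ptr, int maxOffset)
    {
      const int mask = PacketSize - 1;
      const int misalign = int((reinterpret_cast<std::uintptr_t>(ptr) / sizeof(Scalar)) & mask);
      return std::min((PacketSize - misalign) & mask, maxOffset);
    }
    // e.g. for float with PacketSize==4, a pointer 8 bytes past a 16-byte
    // boundary yields alignment_offset<float,4>(p, n) == 2 (assuming n >= 2).
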
@@ -568,9 +585,12 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
                       : alignmentStep==2 ? EvenAligned
                       : FirstAligned;
 
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   int skipRows=0;
-  for (; skipRows
   if (alignedSize>alignedStart)
   {
     // process aligned rhs coeffs
-    if ((i*lhsStride+alignedStart) % PacketSize==0)
+    if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
       for (int j = alignedStart;j
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
-  MatrixType a = MatrixType::random(rows,cols);
+  MatrixType a = MatrixType::Random(rows,cols);
   SquareMatrixType covMat = a * a.adjoint();
 
   BenchTimer timerNoSqrt, timerSqrt;
@@ -108,7 +108,7 @@ __attribute__ ((noinline)) void benchCholesky(const MatrixType& m)
 
 int main(int argc, char* argv[])
 {
-  const int dynsizes[] = {4,6,8,12,16,24,32,64,128,256,512,0};
+  const int dynsizes[] = {/*4,6,8,12,16,24,32,49,64,67,128,129,130,131,132,*/256,257,258,259,260,512,0};
   std::cout << "size no sqrt standard";
 #ifdef BENCH_GSL
   std::cout << " GSL (standard + double + ATLAS) ";
@@ -118,15 +118,15 @@ int main(int argc, char* argv[])
 
   for (uint i=0; dynsizes[i]>0; ++i)
     benchCholesky(Matrix(dynsizes[i],dynsizes[i]));
 
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
-  benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
+//   benchCholesky(Matrix());
 
   return 0;
 }
-- 
cgit v1.2.3
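
For the row-major kernel, the patch swaps an index-based alignment test for an address-based one; the difference only matters for sub-matrices whose data does not start on a packet boundary. The small program below is a hypothetical demonstration of that gap, with illustrative names and sizes (it assumes 16-byte packets of four floats, and that the std::vector storage happens to land on a 16-byte boundary, which is typical on x86-64 but not guaranteed); `row` plays the role of lhs0 in the patch.

    // Illustrative only: a row that starts at a packet-multiple *index* need not
    // start at a packet-aligned *address* when the block's base pointer is offset.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int PacketSize = 4;                  // e.g. four floats per SSE packet
      std::vector<float> buffer(64);
      // hypothetical sub-matrix view starting one scalar past the parent buffer
      const float* lhs = buffer.data() + 1;
      const int lhsStride = 8, i = 2, alignedStart = 4;
      const float* row = lhs + i * lhsStride;

      bool indexAligned   = ((i * lhsStride + alignedStart) % PacketSize) == 0;   // old test
      bool addressAligned = (reinterpret_cast<std::uintptr_t>(row + alignedStart)
                               % (PacketSize * sizeof(float))) == 0;              // new test
      std::printf("index test: %d, address test: %d\n", indexAligned, addressAligned);
      return 0;
    }

With the values above the index test claims alignment (20 is a multiple of 4) while the address test does not, since the row sits one scalar, i.e. 4 bytes, past a packet boundary; that is exactly the case the added ei_internal_assert and lhsAlignmentOffset handle.
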