aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2009-08-04 11:28:02 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2009-08-04 11:28:02 +0200
commit4bec101470091aae27e7469025c80e31b889f566 (patch)
tree2f611c04043575272e7f1a24cda4e1199b7196c4
parent912da9fade7a02086747d086e0634cab0d6ff4b6 (diff)
implement two levels of blocking in PartialLU => high speedup
-rw-r--r--Eigen/src/LU/PartialLU.h209
-rw-r--r--test/inverse.cpp7
2 files changed, 137 insertions, 79 deletions
diff --git a/Eigen/src/LU/PartialLU.h b/Eigen/src/LU/PartialLU.h
index 337438e2f..1cca1c7db 100644
--- a/Eigen/src/LU/PartialLU.h
+++ b/Eigen/src/LU/PartialLU.h
@@ -201,98 +201,153 @@ PartialLU<MatrixType>::PartialLU(const MatrixType& matrix)
compute(matrix);
}
-/** \internal performs the LU decomposition in place of the matrix \a lu.
- * In addition, this function returns the row transpositions in the
- * vector \a row_transpositions which must have a size equal to the number
- * of columns of the matrix \a lu, and an integer \a nb_transpositions
- * which returns the actual number of transpositions.
- */
-template<typename MatrixType, typename IntVector>
-void ei_lu_unblocked(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions)
-{
- const int rows = lu.rows();
- const int size = std::min(lu.rows(),lu.cols());
- nb_transpositions = 0;
- for(int k = 0; k < size; ++k)
- {
- int row_of_biggest_in_col;
- lu.block(k,k,rows-k,1).cwise().abs().maxCoeff(&row_of_biggest_in_col);
- row_of_biggest_in_col += k;
- row_transpositions.coeffRef(k) = row_of_biggest_in_col;
- if(k != row_of_biggest_in_col)
- {
- lu.row(k).swap(lu.row(row_of_biggest_in_col));
- ++nb_transpositions;
- }
-
- if(k<rows-1)
+/** This is the blocked version of ei_lu_unblocked() */
+template<typename Scalar, int StorageOrder>
+struct ei_partial_lu_impl
+{
+ // FIXME add a stride to Map, so that the following mapping becomes easier,
+ // another option would be to create an expression being able to automatically
+ // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
+ // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
+ // and Block.
+ typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
+ typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
+ typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+
+ /** \internal performs the LU decomposition in-place of the matrix \a lu
+ * using an unblocked algorithm.
+ *
+ * In addition, this function returns the row transpositions in the
+ * vector \a row_transpositions which must have a size equal to the number
+ * of columns of the matrix \a lu, and an integer \a nb_transpositions
+ * which returns the actual number of transpositions.
+ */
+ static void unblocked_lu(MatrixType& lu, int* row_transpositions, int& nb_transpositions)
+ {
+ const int rows = lu.rows();
+ const int size = std::min(lu.rows(),lu.cols());
+ nb_transpositions = 0;
+ for(int k = 0; k < size; ++k)
{
- lu.col(k).end(rows-k-1) /= lu.coeff(k,k);
- for(int col = k + 1; col < size; ++col)
- lu.col(col).end(rows-k-1) -= lu.col(k).end(rows-k-1) * lu.coeff(k,col);
+ int row_of_biggest_in_col;
+ lu.block(k,k,rows-k,1).cwise().abs().maxCoeff(&row_of_biggest_in_col);
+ row_of_biggest_in_col += k;
+
+ row_transpositions[k] = row_of_biggest_in_col;
+
+ if(k != row_of_biggest_in_col)
+ {
+ lu.row(k).swap(lu.row(row_of_biggest_in_col));
+ ++nb_transpositions;
+ }
+
+ if(k<rows-1)
+ {
+ lu.col(k).end(rows-k-1) /= lu.coeff(k,k);
+
+ // TODO implement a fast rank one update routine
+ for(int col = k + 1; col < size; ++col)
+ lu.col(col).end(rows-k-1) -= lu.col(k).end(rows-k-1) * lu.coeff(k,col);
+ }
}
}
-}
-/** This is the blocked version of ei_lu_unblocked() */
-template<typename MatrixType, typename IntVector>
-void ei_lu_blocked(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions)
-{
- const int size = lu.rows();
-
- // automatically adjust the number of subdivisions to the size
- // of the matrix so that there is enough sub blocks:
- int blockSize = size/8;
- blockSize = (blockSize/16)*16;
- blockSize = std::min(std::max(blockSize,8), 256);
- // if the matrix is too small, no blocking:
- if(size<32)
- blockSize = size;
-
- nb_transpositions = 0;
- for(int k = 0; k < size; k+=blockSize)
+ /** \internal performs the LU decomposition in-place of the matrix represented
+ * by the variables \a rows, \a cols, \a lu_data, and \a lu_stride using a
+ * recursive, blocked algorithm.
+ *
+ * In addition, this function returns the row transpositions in the
+ * vector \a row_transpositions which must have a size equal to the number
+ * of columns of the matrix \a lu, and an integer \a nb_transpositions
+ * which returns the actual number of transpositions.
+ *
+ * \note This very low level interface using pointers, etc. is to:
+ * 1 - reduce the number of instanciations to the strict minimum
+ * 2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+ */
+ static void blocked_lu(int rows, int cols, Scalar* lu_data, int luStride, int* row_transpositions, int& nb_transpositions, int maxBlockSize=256)
{
- int bs = std::min(size-k,blockSize);
- int ps = size - k;
- int rs = size - k - bs;
- // partition the matrix:
- // A00 | A01 | A02
- // lu = A10 | A11 | A12
- // A20 | A21 | A22
- Block<MatrixType,Dynamic,Dynamic> A_0(lu,0,0,size,k);
- Block<MatrixType,Dynamic,Dynamic> A11_21(lu,k,k,ps,bs);
- Block<MatrixType,Dynamic,Dynamic> A_2(lu,0,k+bs,size,rs);
- Block<MatrixType,Dynamic,Dynamic> A11(lu,k,k,bs,bs);
- Block<MatrixType,Dynamic,Dynamic> A12(lu,k,k+bs,bs,rs);
- Block<MatrixType,Dynamic,Dynamic> A21(lu,k+bs,k,rs,bs);
- Block<MatrixType,Dynamic,Dynamic> A22(lu,k+bs,k+bs,rs,rs);
+ MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
+ MatrixType lu(lu1,0,0,rows,cols);
- VectorBlock<IntVector,Dynamic> row_transpositions_in_panel(row_transpositions,k,bs);
- int nb_transpositions_in_panel;
- ei_lu_unblocked(A11_21, row_transpositions_in_panel, nb_transpositions_in_panel);
- nb_transpositions_in_panel += nb_transpositions_in_panel;
- // update permutations and apply them to A10
- for(int i=k;i<k+bs; ++i)
+ const int size = std::min(rows,cols);
+
+ // if the matrix is too small, no blocking:
+ if(size<=16)
{
- int piv = (row_transpositions.coeffRef(i) += k);
- A_0.row(i).swap(A_0.row(piv));
+ unblocked_lu(lu, row_transpositions, nb_transpositions);
+ return;
}
- if(rs)
+ // automatically adjust the number of subdivisions to the size
+ // of the matrix so that there is enough sub blocks:
+ int blockSize;
{
- // apply permutations to A_2
- for(int i=k;i<k+bs; ++i)
- A_2.row(i).swap(A_2.row(row_transpositions.coeff(i)));
-
- // A12 = A11^-1 A12
- A11.template triangularView<UnitLowerTriangular>().solveInPlace(A12);
+ blockSize = size/8;
+ blockSize = (blockSize/16)*16;
+ blockSize = std::min(std::max(blockSize,8), maxBlockSize);
+ }
- A22 -= A21 * A12;
+ nb_transpositions = 0;
+ for(int k = 0; k < size; k+=blockSize)
+ {
+ int bs = std::min(size-k,blockSize);
+ int ps = size - k;
+ int rs = size - k - bs;
+ // partition the matrix:
+ // A00 | A01 | A02
+ // lu = A10 | A11 | A12
+ // A20 | A21 | A22
+ BlockType A_0(lu,0,0,size,k);
+ BlockType A_2(lu,0,k+bs,size,rs);
+ BlockType A11(lu,k,k,bs,bs);
+ BlockType A12(lu,k,k+bs,bs,rs);
+ BlockType A21(lu,k+bs,k,rs,bs);
+ BlockType A22(lu,k+bs,k+bs,rs,rs);
+
+ int nb_transpositions_in_panel;
+ // recursively calls the blocked LU algorithm with a very small
+ // blocking size:
+ blocked_lu(ps, bs, &lu.coeffRef(k,k), luStride,
+ row_transpositions+k, nb_transpositions_in_panel, 16);
+ nb_transpositions_in_panel += nb_transpositions_in_panel;
+
+ // update permutations and apply them to A10
+ for(int i=k;i<k+bs; ++i)
+ {
+ int piv = (row_transpositions[i] += k);
+ A_0.row(i).swap(A_0.row(piv));
+ }
+
+ if(rs)
+ {
+ // apply permutations to A_2
+ for(int i=k;i<k+bs; ++i)
+ A_2.row(i).swap(A_2.row(row_transpositions[i]));
+
+ // A12 = A11^-1 A12
+ A11.template triangularView<UnitLowerTriangular>().solveInPlace(A12);
+
+ A22 -= A21 * A12;
+ }
}
}
+};
+
+/** \internal performs the LU decomposition with partial pivoting in-place.
+ */
+template<typename MatrixType, typename IntVector>
+void ei_partial_lu_inplace(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions)
+{
+ ei_assert(lu.cols() == row_transpositions.size());
+ ei_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
+
+ ei_partial_lu_impl
+ <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor>
+ ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.stride(), &row_transpositions.coeffRef(0), nb_transpositions);
}
template<typename MatrixType>
@@ -307,7 +362,7 @@ void PartialLU<MatrixType>::compute(const MatrixType& matrix)
IntColVectorType rows_transpositions(size);
int nb_transpositions;
- ei_lu_blocked(m_lu, rows_transpositions, nb_transpositions);
+ ei_partial_lu_inplace(m_lu, rows_transpositions, nb_transpositions);
m_det_p = (nb_transpositions%2) ? -1 : 1;
for(int k = 0; k < size; ++k) m_p.coeffRef(k) = k;
diff --git a/test/inverse.cpp b/test/inverse.cpp
index b4eef73b6..65dfbc73e 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -81,13 +81,16 @@ template<typename MatrixType> void inverse(const MatrixType& m)
void test_inverse()
{
+ int s;
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST( inverse(Matrix<double,1,1>()) );
CALL_SUBTEST( inverse(Matrix2d()) );
CALL_SUBTEST( inverse(Matrix3f()) );
CALL_SUBTEST( inverse(Matrix4f()) );
- CALL_SUBTEST( inverse(MatrixXf(72,72)) );
- CALL_SUBTEST( inverse(MatrixXcd(56,56)) );
+ s = ei_random<int>(50,320);
+ CALL_SUBTEST( inverse(MatrixXf(s,s)) );
+ s = ei_random<int>(25,100);
+ CALL_SUBTEST( inverse(MatrixXcd(s,s)) );
}
// test some tricky cases for 4x4 matrices