From fea4220f3728e60bde06147ff9b89e10df203466 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 30 Oct 2012 15:09:48 +0100 Subject: SparseLU: add a specialized gemm kernel, and add padding to the supernodes such that supernodes columns are all properly aligned --- Eigen/src/SparseLU/SparseLU.h | 17 ++- Eigen/src/SparseLU/SparseLU_Matrix.h | 8 +- Eigen/src/SparseLU/SparseLU_column_bmod.h | 21 ++- Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 240 ++++++++++++++++++++++++++++++ Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 30 ++-- Eigen/src/SparseLU/SparseLU_panel_bmod.h | 44 +++--- Eigen/src/SparseLU/SparseLU_pivotL.h | 5 +- Eigen/src/SparseLU/SparseLU_snode_bmod.h | 16 +- 8 files changed, 329 insertions(+), 52 deletions(-) create mode 100644 Eigen/src/SparseLU/SparseLU_gemm_kernel.h (limited to 'Eigen/src') diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 9ea121ce5..24d120a21 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -206,8 +206,7 @@ class SparseLU for (int k = m_Lstore.nsuper(); k >= 0; k--) { Index fsupc = m_Lstore.supToCol()[k]; - Index istart = m_Lstore.rowIndexPtr()[fsupc]; - Index nsupr = m_Lstore.rowIndexPtr()[fsupc+1] - istart; + Index lda = m_Lstore.colIndexPtr()[fsupc+1] - m_Lstore.colIndexPtr()[fsupc]; // leading dimension Index nsupc = m_Lstore.supToCol()[k+1] - fsupc; Index luptr = m_Lstore.colIndexPtr()[fsupc]; @@ -220,7 +219,7 @@ class SparseLU } else { - Map, 0, OuterStride<> > A( &(m_Lstore.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(nsupr) ); + Map, 0, OuterStride<> > A( &(m_Lstore.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } @@ -252,9 +251,9 @@ class SparseLU { m_perfv.panel_size = 12; m_perfv.relax = 1; - m_perfv.maxsuper = 100; - m_perfv.rowblk = 200; - m_perfv.colblk = 60; + m_perfv.maxsuper = 128; + m_perfv.rowblk = 16; + m_perfv.colblk = 8; m_perfv.fillfactor = 20; } @@ -423,7 +422,7 @@ void SparseLU::factorize(const MatrixType& matrix) ScalarVector dense; dense.setZero(maxpanel); ScalarVector tempv; - tempv.setZero(LU_NUM_TEMPV(m, m_perfv.panel_size, m_perfv.maxsuper, m_perfv.rowblk) ); + tempv.setZero(LU_NUM_TEMPV(m, m_perfv.panel_size, m_perfv.maxsuper, /*m_perfv.rowblk*/m) ); // Compute the inverse of perm_c PermutationType iperm_c(m_perm_c.inverse()); @@ -474,7 +473,9 @@ void SparseLU::factorize(const MatrixType& matrix) nextlu = m_glu.xlusup(jcol); //Starting location of column jcol in lusup (rectangular supernodes) jsupno = m_glu.supno(jcol); // Supernode number which column jcol belongs to fsupc = m_glu.xsup(jsupno); //First column number of the current supernode - new_next = nextlu + (m_glu.xlsub(fsupc+1)-m_glu.xlsub(fsupc)) * (kcol - jcol + 1); + int lda = m_glu.xusub(fsupc+1) - m_glu.xusub(fsupc); + lda = m_glu.xlsub(fsupc+1)-m_glu.xlsub(fsupc); + new_next = nextlu + lda * (kcol - jcol + 1); int mem; while (new_next > m_glu.nzlumax ) { diff --git a/Eigen/src/SparseLU/SparseLU_Matrix.h b/Eigen/src/SparseLU/SparseLU_Matrix.h index 31aeee64d..3b0f4b77a 100644 --- a/Eigen/src/SparseLU/SparseLU_Matrix.h +++ b/Eigen/src/SparseLU/SparseLU_Matrix.h @@ -229,8 +229,7 @@ class SuperNodalMatrix::InnerIterator inline operator bool() const { - return ( (m_idval < m_endval) && (m_idval > m_startval) && - (m_idrow < m_endidrow) && (m_idrow > m_startidrow) ); + return ( (m_idrow < m_endidrow) && (m_idrow > m_startidrow) ); } protected: @@ -283,14 +282,15 @@ void SuperNodalMatrix::solveInPlace( MatrixBase&X) const { // The supernode has more than one column Index luptr = colIndexPtr()[fsupc]; + Index lda = colIndexPtr()[fsupc+1] - luptr; // Triangular solve - Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(nsupr) ); + Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); // Matrix-vector product - new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(nsupr) ); + new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.block(0, 0, nrow, nrhs) = A * U; //Begin Scatter diff --git a/Eigen/src/SparseLU/SparseLU_column_bmod.h b/Eigen/src/SparseLU/SparseLU_column_bmod.h index abe3d9956..d3f7f82e9 100644 --- a/Eigen/src/SparseLU/SparseLU_column_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_column_bmod.h @@ -91,15 +91,17 @@ int SparseLUBase::LU_column_bmod(const int jcol, const int nseg, B segsize = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); - nrow = nsupr - d_fsupc - nsupc; + nrow = nsupr - d_fsupc - nsupc; + int lda = glu.xlusup(fst_col+1) - glu.xlusup(fst_col); + // Perform a triangular solver and block update, // then scatter the result of sup-col update to dense no_zeros = kfnz - fst_col; if(segsize==1) - LU_kernel_bmod<1>::run(segsize, dense, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); + LU_kernel_bmod<1>::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); else - LU_kernel_bmod::run(segsize, dense, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); + LU_kernel_bmod::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); } // end if jsupno } // end for each segment @@ -110,6 +112,9 @@ int SparseLUBase::LU_column_bmod(const int jcol, const int nseg, B // copy the SPA dense into L\U[*,j] int mem; new_next = nextlu + glu.xlsub(fsupc + 1) - glu.xlsub(fsupc); + int offset = internal::first_multiple(new_next, internal::packet_traits::size) - new_next; + if(offset) + new_next += offset; while (new_next > glu.nzlumax ) { mem = LUMemXpand(glu.lusup, glu.nzlumax, nextlu, LUSUP, glu.num_expansions); @@ -124,6 +129,11 @@ int SparseLUBase::LU_column_bmod(const int jcol, const int nseg, B ++nextlu; } + if(offset) + { + glu.lusup.segment(nextlu,offset).setZero(); + nextlu += offset; + } glu.xlusup(jcol + 1) = nextlu; // close L\U(*,jcol); /* For more updates within the panel (also within the current supernode), @@ -148,11 +158,12 @@ int SparseLUBase::LU_column_bmod(const int jcol, const int nseg, B // points to the beginning of jcol in snode L\U(jsupno) ufirst = glu.xlusup(jcol) + d_fsupc; - Map, 0, OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(nsupr) ); + int lda = glu.xlusup(jcol+1) - glu.xlusup(jcol); + Map, 0, OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); VectorBlock u(glu.lusup, ufirst, nsupc); u = A.template triangularView().solve(u); - new (&A) Map, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(nsupr) ); + new (&A) Map, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); VectorBlock l(glu.lusup, ufirst+nsupc, nrow); l.noalias() -= A * u; diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h new file mode 100644 index 000000000..b358b21d3 --- /dev/null +++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -0,0 +1,240 @@ + +#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H +#define EIGEN_SPARSELU_GEMM_KERNEL_H + +namespace Eigen { + +namespace internal { + + +/** \internal + * A general matrix-matrix product kernel optimized for the SparseLU factorization. + * - A, B, and C must be column major + * - lda and ldc must be multiples of the respective packet size + * - C must have the same alignment as A + */ +template +EIGEN_DONT_INLINE +void sparselu_gemm(int m, int n, int d, const Scalar* A, int lda, const Scalar* B, int ldb, Scalar* C, int ldc) +{ + using namespace Eigen::internal; + + typedef typename packet_traits::type Packet; + enum { + PacketSize = packet_traits::size, + PM = 8, // peeling in M + RN = 2, // register blocking + RK = 4, // register blocking + BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk + SM = PM*PacketSize // step along M + }; + int d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking + int n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once + int i0 = internal::first_aligned(A,m); + + eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m))); + + // handle the non aligned rows of A and C without any optimization: + for(int i=0; i(BM, m-ib); // actual number of rows + int actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling + int actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization + + // Let's process two columns of B-C at once + for(int j=0; j(Bc0[0]); + b10 = pset1(Bc0[1]); + b20 = pset1(Bc0[2]); + b30 = pset1(Bc0[3]); + b01 = pset1(Bc1[0]); + b11 = pset1(Bc1[1]); + b21 = pset1(Bc1[2]); + b31 = pset1(Bc1[3]); + + Packet a0, a1, a2, a3, c0, c1, t0, t1; + + const Scalar* A0 = A+ib+(k+0)*lda; + const Scalar* A1 = A+ib+(k+1)*lda; + const Scalar* A2 = A+ib+(k+2)*lda; + const Scalar* A3 = A+ib+(k+3)*lda; + + Scalar* C0 = C+ib+(j+0)*ldc; + Scalar* C1 = C+ib+(j+1)*ldc; + + a0 = pload(A0); + a1 = pload(A1); + a2 = pload(A2); + a3 = pload(A3); + +#define KMADD(c, a, b, tmp) tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); +#define WORK(I) \ + c0 = pload(C0+i+(I)*PacketSize); \ + c1 = pload(C1+i+(I)*PacketSize); \ + KMADD(c0, a0, b00, t0); \ + KMADD(c1, a0, b01, t1); \ + a0 = pload(A0+i+(I+1)*PacketSize); \ + KMADD(c0, a1, b10, t0); \ + KMADD(c1, a1, b11, t1); \ + a1 = pload(A1+i+(I+1)*PacketSize); \ + KMADD(c0, a2, b20, t0); \ + KMADD(c1, a2, b21, t1); \ + a2 = pload(A2+i+(I+1)*PacketSize); \ + KMADD(c0, a3, b30, t0); \ + KMADD(c1, a3, b31, t1); \ + a3 = pload(A3+i+(I+1)*PacketSize); \ + pstore(C0+i+(I)*PacketSize, c0); \ + pstore(C1+i+(I)*PacketSize, c1) + + // process rows of A' - C' with aggressive vectorization and peeling + for(int i=0; i0) + { + const Scalar* Bc0 = B+(n-1)*ldb; + + for(int k=0; k(Bc0[0]); + b10 = pset1(Bc0[1]); + b20 = pset1(Bc0[2]); + b30 = pset1(Bc0[3]); + + Packet a0, a1, a2, a3, c0, t0/*, t1*/; + + const Scalar* A0 = A+ib+(k+0)*lda; + const Scalar* A1 = A+ib+(k+1)*lda; + const Scalar* A2 = A+ib+(k+2)*lda; + const Scalar* A3 = A+ib+(k+3)*lda; + + Scalar* C0 = C+ib+(n_end)*ldc; + + a0 = pload(A0); + a1 = pload(A1); + a2 = pload(A2); + a3 = pload(A3); + +#define WORK(I) \ + c0 = pload(C0+i+(I)*PacketSize); \ + KMADD(c0, a0, b00, t0); \ + a0 = pload(A0+i+(I+1)*PacketSize); \ + KMADD(c0, a1, b10, t0); \ + a1 = pload(A1+i+(I+1)*PacketSize); \ + KMADD(c0, a2, b20, t0); \ + a2 = pload(A2+i+(I+1)*PacketSize); \ + KMADD(c0, a3, b30, t0); \ + a3 = pload(A3+i+(I+1)*PacketSize); \ + pstore(C0+i+(I)*PacketSize, c0); + + // agressive vectorization and peeling + for(int i=0; i0) + { + for(int j=0; j, Aligned > MapVector; + typedef Map, Aligned > ConstMapVector; + if(rd==1) MapVector (C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); + + else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) + + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); + + else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) + + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) + + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); + } + } + + } // blocking on the rows of A and C +} +#undef KMADD + +} // namespace internal + +} // namespace Eigen + +#endif // EIGEN_SPARSELU_GEMM_KERNEL_H diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index b15ff9c50..ca53fb6d0 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -18,7 +18,7 @@ * \param [in,out]dense Packed values of the original matrix * \param tempv temporary vector to use for updates * \param lusup array containing the supernodes - * \param nsupr Number of rows in the supernode + * \param lda Leading dimension in the supernode * \param nrow Number of rows in the rectangular part of the supernode * \param lsub compressed row subscripts of supernodes * \param lptr pointer to the first column of the current supernode in lsub @@ -28,7 +28,7 @@ template struct LU_kernel_bmod { template - EIGEN_DONT_INLINE static void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, int& luptr, const int nsupr, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) + EIGEN_DONT_INLINE static void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, int& luptr, const int lda, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) { typedef typename ScalarVector::Scalar Scalar; // First, copy U[*,j] segment from dense(*) to tempv(*) @@ -43,23 +43,24 @@ template struct LU_kernel_bmod ++isub; } // Dense triangular solve -- start effective triangle - luptr += nsupr * no_zeros + no_zeros; + luptr += lda * no_zeros + no_zeros; // Form Eigen matrix and vector - Map, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(nsupr) ); + Map, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) ); Map > u(tempv.data(), segsize); u = A.template triangularView().solve(u); // Dense matrix-vector product y <-- B*x luptr += segsize; - Map, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(nsupr) ); - Map > l(tempv.data()+segsize, nrow); - if(SegSizeAtCompileTime==2) - l = u(0) * B.col(0) + u(1) * B.col(1); - else if(SegSizeAtCompileTime==3) - l = u(0) * B.col(0) + u(1) * B.col(1) + u(2) * B.col(2); - else - l.noalias() = B * u; + const int PacketSize = internal::packet_traits::size; + int ldl = internal::first_multiple(nrow, PacketSize); + Map, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) ); + int aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize); + int aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize; + Map, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); + + l.setZero(); + internal::sparselu_gemm(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride()); // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; @@ -81,11 +82,12 @@ template struct LU_kernel_bmod template <> struct LU_kernel_bmod<1> { template - EIGEN_DONT_INLINE static void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, int& luptr, const int nsupr, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) + EIGEN_DONT_INLINE static void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, int& luptr, const int lda, const int nrow, + IndexVector& lsub, const int lptr, const int no_zeros) { typedef typename ScalarVector::Scalar Scalar; Scalar f = dense(lsub(lptr + no_zeros)); - luptr += nsupr * no_zeros + no_zeros + 1; + luptr += lda * no_zeros + no_zeros + 1; const Scalar* a(lusup.data() + luptr); const typename IndexVector::Scalar* irow(lsub.data()+lptr + no_zeros + 1); int i = 0; diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index 6688b4e3e..bc2a04f44 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -30,6 +30,7 @@ */ #ifndef SPARSELU_PANEL_BMOD_H #define SPARSELU_PANEL_BMOD_H + /** * \brief Performs numeric block updates (sup-panel) in topological order. * @@ -49,7 +50,8 @@ * */ template -void SparseLUBase::LU_panel_bmod(const int m, const int w, const int jcol, const int nseg, ScalarVector& dense, ScalarVector& tempv, IndexVector& segrep, IndexVector& repfnz, LU_perfvalues& perfv, GlobalLU_t& glu) +void SparseLUBase::LU_panel_bmod(const int m, const int w, const int jcol, const int nseg, ScalarVector& dense, ScalarVector& tempv, + IndexVector& segrep, IndexVector& repfnz, LU_perfvalues& perfv, GlobalLU_t& glu) { int ksub,jj,nextl_col; @@ -60,9 +62,10 @@ void SparseLUBase::LU_panel_bmod(const int m, const int w, const i int segsize,no_zeros ; // For each nonz supernode segment of U[*,j] in topological order int k = nseg - 1; + const Index PacketSize = internal::packet_traits::size; + for (ksub = 0; ksub < nseg; ksub++) { // For each updating supernode - /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = number of columns in a supernode @@ -92,11 +95,10 @@ void SparseLUBase::LU_panel_bmod(const int m, const int w, const i u_rows = (std::max)(segsize,u_rows); } - // if the blocks are large enough, use level 3 - // TODO find better heuristics! - if( nsupc >= perfv.colblk && nrow > perfv.rowblk && u_cols>perfv.relax) + if(nsupc >= 2) { - Map > U(tempv.data(), u_rows, u_cols); + int ldu = internal::first_multiple(u_rows, PacketSize); + Map, Aligned, OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu)); // gather U int u_col = 0; @@ -127,17 +129,23 @@ void SparseLUBase::LU_panel_bmod(const int m, const int w, const i } // solve U = A^-1 U luptr = glu.xlusup(fsupc); + int lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); no_zeros = (krep - u_rows + 1) - fsupc; - luptr += nsupr * no_zeros + no_zeros; - Map, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(nsupr) ); + luptr += lda * no_zeros + no_zeros; + Map, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) ); U = A.template triangularView().solve(U); // update luptr += u_rows; - Map, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(nsupr) ); - assert(tempv.size()>w*u_rows + nrow*w); - Map > L(tempv.data()+w*u_rows, nrow, u_cols); - L.noalias() = B * U; + Map, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) ); + eigen_assert(tempv.size()>w*ldu + nrow*w + 1); + + int ldl = internal::first_multiple(nrow, PacketSize); + int offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize; + Map, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); + + L.setZero(); + internal::sparselu_gemm(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride()); // scatter U and L u_col = 0; @@ -187,15 +195,17 @@ void SparseLUBase::LU_panel_bmod(const int m, const int w, const i continue; // skip any zero segment segsize = krep - kfnz + 1; - luptr = glu.xlusup(fsupc); + luptr = glu.xlusup(fsupc); + + int lda = glu.xlusup(fsupc+1)-glu.xlusup(fsupc);// nsupr // Perform a trianglar solve and block update, // then scatter the result of sup-col update to dense[] no_zeros = kfnz - fsupc; - if(segsize==1) LU_kernel_bmod<1>::run(segsize, dense_col, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); - else if(segsize==2) LU_kernel_bmod<2>::run(segsize, dense_col, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); - else if(segsize==3) LU_kernel_bmod<3>::run(segsize, dense_col, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); - else LU_kernel_bmod::run(segsize, dense_col, tempv, glu.lusup, luptr, nsupr, nrow, glu.lsub, lptr, no_zeros); + if(segsize==1) LU_kernel_bmod<1>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); + else if(segsize==2) LU_kernel_bmod<2>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); + else if(segsize==3) LU_kernel_bmod<3>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); + else LU_kernel_bmod::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); } // End for each column in the panel } diff --git a/Eigen/src/SparseLU/SparseLU_pivotL.h b/Eigen/src/SparseLU/SparseLU_pivotL.h index c4a9f1c74..9f56903c8 100644 --- a/Eigen/src/SparseLU/SparseLU_pivotL.h +++ b/Eigen/src/SparseLU/SparseLU_pivotL.h @@ -60,6 +60,7 @@ int SparseLUBase::LU_pivotL(const int jcol, const RealScalar diagp Index nsupc = jcol - fsupc; // Number of columns in the supernode portion, excluding jcol; nsupc >=0 Index lptr = glu.xlsub(fsupc); // pointer to the starting location of the row subscripts for this supernode portion Index nsupr = glu.xlsub(fsupc+1) - lptr; // Number of rows in the supernode + Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode @@ -112,8 +113,8 @@ int SparseLUBase::LU_pivotL(const int jcol, const RealScalar diagp // such that L is indexed the same way as A for (icol = 0; icol <= nsupc; icol++) { - itemp = pivptr + icol * nsupr; - std::swap(lu_sup_ptr[itemp], lu_sup_ptr[nsupc + icol * nsupr]); + itemp = pivptr + icol * lda; + std::swap(lu_sup_ptr[itemp], lu_sup_ptr[nsupc + icol * lda]); } } // cdiv operations diff --git a/Eigen/src/SparseLU/SparseLU_snode_bmod.h b/Eigen/src/SparseLU/SparseLU_snode_bmod.h index beea71e31..6774dc7bc 100644 --- a/Eigen/src/SparseLU/SparseLU_snode_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_snode_bmod.h @@ -47,6 +47,17 @@ int SparseLUBase::LU_snode_bmod (const int jcol, const int fsupc, dense(irow) = 0; ++nextlu; } + // Make sure the leading dimension is a multiple of the underlying packet size + // so that fast fully aligned kernels can be enabled: + { + Index lda = nextlu-glu.xlusup(jcol); + int offset = internal::first_multiple(lda, internal::packet_traits::size) - lda; + if(offset) + { + glu.lusup.segment(nextlu,offset).setZero(); + nextlu += offset; + } + } glu.xlusup(jcol + 1) = nextlu; // Initialize xlusup for next column ( jcol+1 ) if (fsupc < jcol ){ @@ -54,16 +65,17 @@ int SparseLUBase::LU_snode_bmod (const int jcol, const int fsupc, int nsupr = glu.xlsub(fsupc + 1) -glu.xlsub(fsupc); //Number of rows in the supernode int nsupc = jcol - fsupc; // Number of columns in the supernodal portion of L\U[*,jcol] int ufirst = glu.xlusup(jcol); // points to the beginning of column jcol in supernode L\U(jsupno) + int lda = glu.xlusup(jcol+1) - ufirst; int nrow = nsupr - nsupc; // Number of rows in the off-diagonal blocks // Solve the triangular system for U(fsupc:jcol, jcol) with L(fspuc:jcol, fsupc:jcol) - Map,0,OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(nsupr) ); + Map,0,OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); VectorBlock u(glu.lusup, ufirst, nsupc); u = A.template triangularView().solve(u); // Call the Eigen dense triangular solve interface // Update the trailing part of the column jcol U(jcol:jcol+nrow, jcol) using L(jcol:jcol+nrow, fsupc:jcol) and U(fsupc:jcol) - new (&A) Map,0,OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(nsupr) ); + new (&A) Map,0,OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); VectorBlock l(glu.lusup, ufirst+nsupc, nrow); l.noalias() -= A * u; } -- cgit v1.2.3