diff options
author | Gael Guennebaud <g.gael@free.fr> | 2012-10-30 15:09:48 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2012-10-30 15:09:48 +0100 |
commit | fea4220f3728e60bde06147ff9b89e10df203466 (patch) | |
tree | c26978864d04c3da65f09fe2590feb6b4dce52f0 /Eigen/src/SparseLU/SparseLU_kernel_bmod.h | |
parent | f7e203fb0c3033df26b3f75cc3812989c19041a2 (diff) |
SparseLU: add a specialized gemm kernel, and add padding to the supernodes such that supernodes columns are all properly aligned
Diffstat (limited to 'Eigen/src/SparseLU/SparseLU_kernel_bmod.h')
-rw-r--r-- | Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 30 |
1 files changed, 16 insertions, 14 deletions
diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index b15ff9c50..ca53fb6d0 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -18,7 +18,7 @@ * \param [in,out]dense Packed values of the original matrix * \param tempv temporary vector to use for updates * \param lusup array containing the supernodes - * \param nsupr Number of rows in the supernode + * \param lda Leading dimension in the supernode * \param nrow Number of rows in the rectangular part of the supernode * \param lsub compressed row subscripts of supernodes * \param lptr pointer to the first column of the current supernode in lsub @@ -28,7 +28,7 @@ template <int SegSizeAtCompileTime> struct LU_kernel_bmod { template <typename BlockScalarVector, typename ScalarVector, typename IndexVector> - EIGEN_DONT_INLINE static void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, int& luptr, const int nsupr, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) + EIGEN_DONT_INLINE static void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, int& luptr, const int lda, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) { typedef typename ScalarVector::Scalar Scalar; // First, copy U[*,j] segment from dense(*) to tempv(*) @@ -43,23 +43,24 @@ template <int SegSizeAtCompileTime> struct LU_kernel_bmod ++isub; } // Dense triangular solve -- start effective triangle - luptr += nsupr * no_zeros + no_zeros; + luptr += lda * no_zeros + no_zeros; // Form Eigen matrix and vector - Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(nsupr) ); + Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) ); Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize); u = A.template triangularView<UnitLower>().solve(u); // Dense matrix-vector product y <-- B*x luptr += segsize; - Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(nsupr) ); - Map<Matrix<Scalar,Dynamic,1> > l(tempv.data()+segsize, nrow); - if(SegSizeAtCompileTime==2) - l = u(0) * B.col(0) + u(1) * B.col(1); - else if(SegSizeAtCompileTime==3) - l = u(0) * B.col(0) + u(1) * B.col(1) + u(2) * B.col(2); - else - l.noalias() = B * u; + const int PacketSize = internal::packet_traits<Scalar>::size; + int ldl = internal::first_multiple(nrow, PacketSize); + Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) ); + int aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize); + int aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize; + Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); + + l.setZero(); + internal::sparselu_gemm<Scalar>(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride()); // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; @@ -81,11 +82,12 @@ template <int SegSizeAtCompileTime> struct LU_kernel_bmod template <> struct LU_kernel_bmod<1> { template <typename BlockScalarVector, typename ScalarVector, typename IndexVector> - EIGEN_DONT_INLINE static void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, int& luptr, const int nsupr, const int nrow, IndexVector& lsub, const int lptr, const int no_zeros) + EIGEN_DONT_INLINE static void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, int& luptr, const int lda, const int nrow, + IndexVector& lsub, const int lptr, const int no_zeros) { typedef typename ScalarVector::Scalar Scalar; Scalar f = dense(lsub(lptr + no_zeros)); - luptr += nsupr * no_zeros + no_zeros + 1; + luptr += lda * no_zeros + no_zeros + 1; const Scalar* a(lusup.data() + luptr); const typename IndexVector::Scalar* irow(lsub.data()+lptr + no_zeros + 1); int i = 0; |