From f4112dcff3ebdded503d21d7051cc6301899dc24 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 25 Jul 2009 21:41:01 +0200 Subject: The new trsm is working very very well (read very fast) for lower triangular matrix and row or col major lhs. TODO: handle upper triangular and row major rhs cases --- Eigen/src/Core/products/TriangularSolverMatrix.h | 173 +++++++++++------------ 1 file changed, 84 insertions(+), 89 deletions(-) (limited to 'Eigen/src/Core') diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 0a5b4749f..798cd6c09 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -25,6 +25,48 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H +template +struct ei_gemm_pack_rhs_panel +{ + enum { PacketSize = ei_packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset) + { + int packet_cols = (cols/nr) * nr; + int count = 0; + for(int j2=0; j2 IsLowerTriangular = (Mode&LowerTriangular) == LowerTriangular }; - int kc = 8;//std::min(Blocking::Max_kc,size); // cache block size along the K direction - int mc = 8;//std::min(Blocking::Max_mc,size); // cache block size along the M direction + int kc = std::min(Blocking::Max_kc/4,size); // cache block size along the K direction + int mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); +// Scalar* blockB = new Scalar[10*kc*cols*Blocking::PacketSize]; Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); ei_gebp_kernel > gebp_kernel; @@ -66,26 +109,16 @@ struct ei_triangular_solve_matrix// // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into // A11 (the triangular part) and A21 the remaining rectangular part. // Then the high level algorithm is: - // - B = R1 => general block copy + // - B = R1 => general block copy (done during the next step) // - R1 = L1^-1 B => tricky part - // - update B from the new R1 => actually this has to performed continuously during the above step + // - update B from the new R1 => actually this has to be performed continuously during the above step // - R2 = L2 * B => GEPP - // B = R1 - ei_gemm_pack_rhs() - (blockB, &rhs(k2,0), rhsStride, -1, actual_kc, cols); - - Map(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)).setZero(); - // The tricky part: R1 = L1^-1 B while updating B from R1 // The idea is to split L1 into multiple small vertical panels. // Each panel can be split into a small triangular part A1 which is processed without optimization, // and the remaining small part A2 which is processed using gebp with appropriate block strides { - // pack L1 -// ei_gemm_pack_lhs() -// (blockA, &lhs(k2, k2), lhsStride, actual_kc, actual_kc); - // for each small vertical panels of lhs for (int k1=0; k1 for (int k=0; k0) - { - rhs.block(i+1,0,rs,cols) -= - lhs.col(i).segment(IsLowerTriangular ? i+1 : i-rs, rs) * rhs.row(i); - } - } - // update the respective row of B from rhs - { - const Scalar* lr = _rhs+k2+k1; - int packet_cols = (cols/Blocking::nr) * Blocking::nr; - int count = 0; - for(int j2=0; j2() +// .solveInPlace(rhs.col(j).segment(k2+k1,actualPanelWidth)); +// lhs.block(k2+k1,k2+k1,actualPanelWidth,actualPanelWidth).template triangularView() +// .solveInPlace(rhs.block(k2+k1,0,actualPanelWidth,cols)); -// std::cerr << Map(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) << "\n\n"; + // update the respective rows of B from rhs + ei_gemm_pack_rhs_panel() + (blockB, _rhs+k2+k1, rhsStride, -1, actualPanelWidth, cols, actual_kc, k1); -// MatrixXf aux(Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)); -// aux.setZero(); - -// ei_gemm_pack_rhs() -// (aux.data(), &rhs(k2,0), rhsStride, -1, actual_kc, cols); - -// std::cerr << Map(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) - aux << "\n\n"; - - - // gebp + // GEBP int i = k1+actualPanelWidth; int rs = actual_kc-i; -// ei_gemm_pack_rhs() -// (blockB, &rhs(k1,0), rhsStride, -1, actualPanelWidth, cols); - - ei_gemm_pack_lhs() - (blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs); - if (rs>0) - rhs.block(i,0,actual_kc-i,cols) -= lhs.block(i,k1,rs,actualPanelWidth) * rhs.block(k1,0,actualPanelWidth,cols); - -// gebp_kernel(_rhs+i+k2, rhsStride, -// blockA/*+actual_kc*i+k1*rs*/, blockB/*+k1*Blocking::PacketSize*Blocking::nr*/, rs, actualPanelWidth, cols, actualPanelWidth/*actual_kc*/, actual_kc, 0, k1*Blocking::PacketSize); - -// gebp_kernel(_rhs+i, rhsStride, -// blockA+actual_kc*i+k1*rs, blockB+k1*Blocking::PacketSize*Blocking::nr, rs, actualPanelWidth, cols, actual_kc, actual_kc); + { + ei_gemm_pack_lhs() + (blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs); -// gebp_kernel(_rhs+k2+i, rhsStride, -// blockA+actual_kc*i+k1, blockB+k1*Blocking::PacketSize, actual_kc-i, actualPanelWidth, cols, actual_kc, actual_kc); + gebp_kernel(_rhs+i+k2, rhsStride, + blockA, blockB, rs, actualPanelWidth, cols, actualPanelWidth, actual_kc, 0, k1*Blocking::PacketSize); + } } } @@ -186,17 +183,15 @@ struct ei_triangular_solve_matrix// { const int actual_mc = std::min(i2+mc,size)-i2; ei_gemm_pack_lhs() - (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); - - std::cerr << i2 << " sur " << actual_mc << " -= " << i2 << "x" << k2 << "+" << actual_mc<<"," <