diff options
author | Gael Guennebaud <g.gael@free.fr> | 2011-03-24 23:19:53 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2011-03-24 23:19:53 +0100 |
commit | 931814d7c08f83e12b39f6185a437cb10698c0ec (patch) | |
tree | 97a1f629891c669a3ac338d4d5543ce813caecdb /Eigen | |
parent | c6ad2deead4e8f7db743a103200b51cfaf1d8e7f (diff) |
improve performance of trsm
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/products/TriangularSolverMatrix.h | 32 |
1 files changed, 18 insertions, 14 deletions
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index a6ad9c322..4f5f85cbc 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -75,6 +75,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockW = allocatedBlockB; conj_if<Conjugate> conj; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; @@ -92,16 +93,19 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO // A11 (the triangular part) and A21 the remaining rectangular part. // Then the high level algorithm is: // - B = R1 => general block copy (done during the next step) - // - R1 = L1^-1 B => tricky part + // - R1 = A11^-1 B => tricky part // - update B from the new R1 => actually this has to be performed continuously during the above step - // - R2 = L2 * B => GEPP - - // The tricky part: compute R1 = L1^-1 B while updating B from R1 - // The idea is to split L1 into multiple small vertical panels. - // Each panel can be split into a small triangular part A1 which is processed without optimization, - // and the remaining small part A2 which is processed using gebp with appropriate block strides + // - R2 -= A21 * B => GEPP + + // The tricky part: compute R1 = A11^-1 B while updating B from R1 + // The idea is to split A11 into multiple small vertical panels. + // Each panel can be split into a small triangular part T1k which is processed without optimization, + // and the remaining small part T2k which is processed using gebp with appropriate block strides + Index subcols = (kc/Traits::nr)*Traits::nr; // TODO kc might not be an ideal choice here + for(Index j2=0; j2<cols; j2+=subcols) { - // for each small vertical panels of lhs + Index actual_cols = std::min(cols-j2,subcols); + // for each small vertical panels [T1k^T, T2k^T]^T of lhs for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) { Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); @@ -114,7 +118,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO Index rs = actualPanelWidth - k - 1; // remaining size Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i)); - for (Index j=0; j<cols; ++j) + for (Index j=j2; j<j2+actual_cols; ++j) { if (TriStorageOrder==RowMajor) { @@ -143,7 +147,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset); + pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) @@ -152,13 +156,13 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); - gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1), - actualPanelWidth, actual_kc, 0, blockBOffset); + gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + actualPanelWidth, actual_kc, 0, blockBOffset, blockW); } } } - - // R2 = A2 * B => GEPP + + // R2 -= A21 * B => GEPP { Index start = IsLower ? k2+kc : 0; Index end = IsLower ? size : k2-kc; |