aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2011-03-24 23:19:53 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2011-03-24 23:19:53 +0100
commit931814d7c08f83e12b39f6185a437cb10698c0ec (patch)
tree97a1f629891c669a3ac338d4d5543ce813caecdb /Eigen
parentc6ad2deead4e8f7db743a103200b51cfaf1d8e7f (diff)
improve performance of trsm
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/src/Core/products/TriangularSolverMatrix.h32
1 files changed, 18 insertions, 14 deletions
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index a6ad9c322..4f5f85cbc 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -75,6 +75,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW;
+ Scalar* blockW = allocatedBlockB;
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@@ -92,16 +93,19 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
// A11 (the triangular part) and A21 the remaining rectangular part.
// Then the high level algorithm is:
// - B = R1 => general block copy (done during the next step)
- // - R1 = L1^-1 B => tricky part
+ // - R1 = A11^-1 B => tricky part
// - update B from the new R1 => actually this has to be performed continuously during the above step
- // - R2 = L2 * B => GEPP
-
- // The tricky part: compute R1 = L1^-1 B while updating B from R1
- // The idea is to split L1 into multiple small vertical panels.
- // Each panel can be split into a small triangular part A1 which is processed without optimization,
- // and the remaining small part A2 which is processed using gebp with appropriate block strides
+ // - R2 -= A21 * B => GEPP
+
+ // The tricky part: compute R1 = A11^-1 B while updating B from R1
+ // The idea is to split A11 into multiple small vertical panels.
+ // Each panel can be split into a small triangular part T1k which is processed without optimization,
+ // and the remaining small part T2k which is processed using gebp with appropriate block strides
+ Index subcols = (kc/Traits::nr)*Traits::nr; // TODO kc might not be an ideal choice here
+ for(Index j2=0; j2<cols; j2+=subcols)
{
- // for each small vertical panels of lhs
+ Index actual_cols = std::min(cols-j2,subcols);
+ // for each small vertical panels [T1k^T, T2k^T]^T of lhs
for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
{
Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
@@ -114,7 +118,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
Index rs = actualPanelWidth - k - 1; // remaining size
Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
- for (Index j=0; j<cols; ++j)
+ for (Index j=j2; j<j2+actual_cols; ++j)
{
if (TriStorageOrder==RowMajor)
{
@@ -143,7 +147,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
- pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset);
+ pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
@@ -152,13 +156,13 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
- gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1),
- actualPanelWidth, actual_kc, 0, blockBOffset);
+ gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+ actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
}
}
}
-
- // R2 = A2 * B => GEPP
+
+ // R2 -= A21 * B => GEPP
{
Index start = IsLower ? k2+kc : 0;
Index end = IsLower ? size : k2-kc;