aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2009-07-25 21:41:01 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2009-07-25 21:41:01 +0200
commitf4112dcff3ebdded503d21d7051cc6301899dc24 (patch)
tree0633734ae40e5b1c49d25b64451a5eb2b5a1bbe4 /Eigen
parent35927e78c21b11113889f93325b980fc13349af9 (diff)
The new trsm is working very very well (read very fast) for
lower triangular matrix and row or col major lhs. TODO: handle upper triangular and row major rhs cases
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/Core1
-rw-r--r--Eigen/src/Core/products/TriangularSolverMatrix.h173
2 files changed, 85 insertions, 89 deletions
diff --git a/Eigen/Core b/Eigen/Core
index fadecfa89..5ebbd9496 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -188,6 +188,7 @@ namespace Eigen {
#include "src/Core/products/SelfadjointProduct.h"
#include "src/Core/products/SelfadjointRank2Update.h"
#include "src/Core/products/TriangularMatrixVector.h"
+#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/BandMatrix.h"
} // namespace Eigen
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 0a5b4749f..798cd6c09 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -25,6 +25,48 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+template<typename Scalar, int nr>
+struct ei_gemm_pack_rhs_panel
+{
+ enum { PacketSize = ei_packet_traits<Scalar>::size };
+ void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset)
+ {
+ int packet_cols = (cols/nr) * nr;
+ int count = 0;
+ for(int j2=0; j2<packet_cols; j2+=nr)
+ {
+ // skip what we have before
+ count += PacketSize * nr * offset;
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+ const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+ const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+ const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+ for(int k=0; k<depth; k++)
+ {
+ ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
+ ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
+ if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
+ if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
+ count += nr*PacketSize;
+ }
+ // skip what we have after
+ count += PacketSize * nr * (stride-offset-depth);
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(int j2=packet_cols; j2<cols; ++j2)
+ {
+ count += PacketSize * offset;
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+ for(int k=0; k<depth; k++)
+ {
+ ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
+ count += PacketSize;
+ }
+ count += PacketSize * (stride-offset-depth);
+ }
+ }
+};
+
/* Optimized triangular solver with multiple right hand side (_TRSM)
*/
template <typename Scalar,
@@ -50,10 +92,11 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
IsLowerTriangular = (Mode&LowerTriangular) == LowerTriangular
};
- int kc = 8;//std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction
- int mc = 8;//std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
+ int kc = std::min<int>(Blocking::Max_kc/4,size); // cache block size along the K direction
+ int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+// Scalar* blockB = new Scalar[10*kc*cols*Blocking::PacketSize];
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,false> > gebp_kernel;
@@ -66,26 +109,16 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
// and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
// A11 (the triangular part) and A21 the remaining rectangular part.
// Then the high level algorithm is:
- // - B = R1 => general block copy
+ // - B = R1 => general block copy (done during the next step)
// - R1 = L1^-1 B => tricky part
- // - update B from the new R1 => actually this has to performed continuously during the above step
+ // - update B from the new R1 => actually this has to be performed continuously during the above step
// - R2 = L2 * B => GEPP
- // B = R1
- ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
- (blockB, &rhs(k2,0), rhsStride, -1, actual_kc, cols);
-
- Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)).setZero();
-
// The tricky part: R1 = L1^-1 B while updating B from R1
// The idea is to split L1 into multiple small vertical panels.
// Each panel can be split into a small triangular part A1 which is processed without optimization,
// and the remaining small part A2 which is processed using gebp with appropriate block strides
{
- // pack L1
-// ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
-// (blockA, &lhs(k2, k2), lhsStride, actual_kc, actual_kc);
-
// for each small vertical panels of lhs
for (int k1=0; k1<actual_kc; k1+=SmallPanelWidth)
{
@@ -94,90 +127,54 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
for (int k=0; k<actualPanelWidth; ++k)
{
int i = k2+k1+k;
- if(!(Mode & UnitDiagBit))
- rhs.row(i) /= lhs(i,i);
-
int rs = actualPanelWidth - k - 1; // remaining size
- //std::cerr << i << " ; " << k << " " << rs << "\n";
- if (rs>0)
- {
- rhs.block(i+1,0,rs,cols) -=
- lhs.col(i).segment(IsLowerTriangular ? i+1 : i-rs, rs) * rhs.row(i);
- }
- }
- // update the respective row of B from rhs
- {
- const Scalar* lr = _rhs+k2+k1;
- int packet_cols = (cols/Blocking::nr) * Blocking::nr;
- int count = 0;
- for(int j2=0; j2<packet_cols; j2+=Blocking::nr)
+
+ Scalar a = (Mode & UnitDiagBit) ? Scalar(1) : Scalar(1)/lhs(i,i);
+ for (int j=0; j<cols; ++j)
{
- // skip what we have before
- count += Blocking::PacketSize * Blocking::nr * (k1-k2);
- const Scalar* b0 = &lr[(j2+0)*rhsStride];
- const Scalar* b1 = &lr[(j2+1)*rhsStride];
- const Scalar* b2 = &lr[(j2+2)*rhsStride];
- const Scalar* b3 = &lr[(j2+3)*rhsStride];
- for(int k=0; k<actualPanelWidth; k++)
+
+ if (LhsStorageOrder==RowMajor)
{
- ei_pstore(&blockB[count+0*Blocking::PacketSize], ei_pset1(-b0[k]));
- ei_pstore(&blockB[count+1*Blocking::PacketSize], ei_pset1(-b1[k]));
- if (Blocking::nr==4)
- {
- ei_pstore(&blockB[count+2*Blocking::PacketSize], ei_pset1(-b2[k]));
- ei_pstore(&blockB[count+3*Blocking::PacketSize], ei_pset1(-b3[k]));
- }
- count += Blocking::nr*Blocking::PacketSize;
+ Scalar b = 0;
+ Scalar* r = &rhs.coeffRef(k2+k1,j);
+ const Scalar* l = &lhs.coeff(i,k2+k1);
+ for (int i3=0; i3<k; ++i3)
+ b += l[i3] * r[i3];
+
+ rhs.coeffRef(i,j) = (rhs.coeffRef(i,j) - b)*a;
}
- // skip what we have after
- count += Blocking::PacketSize * Blocking::nr * (actual_kc-k1-actualPanelWidth);
- }
- // copy the remaining columns one at a time (nr==1)
- for(int j2=packet_cols; j2<cols; ++j2)
- {
- count += Blocking::PacketSize * (k1-k2);
- const Scalar* b0 = &lr[(j2+0)*rhsStride];
- for(int k=0; k<actualPanelWidth; k++)
+ else
{
- ei_pstore(&blockB[count], ei_pset1(-b0[k]));
- count += Blocking::PacketSize;
+ Scalar b = (rhs.coeffRef(i,j) *= a);
+ Scalar* r = &rhs.coeffRef(i+1,j);
+ const Scalar* l = &lhs.coeff(i+1,i);
+ for (int i3=0;i3<rs;++i3)
+ r[i3] -= b * l[i3];
}
- count += Blocking::PacketSize * (actual_kc-k1-actualPanelWidth);
}
}
+// for (int j=0; j<cols; ++j)
+// lhs.block(k2+k1,k2+k1,actualPanelWidth,actualPanelWidth).template triangularView<LowerTriangular>()
+// .solveInPlace(rhs.col(j).segment(k2+k1,actualPanelWidth));
+// lhs.block(k2+k1,k2+k1,actualPanelWidth,actualPanelWidth).template triangularView<LowerTriangular>()
+// .solveInPlace(rhs.block(k2+k1,0,actualPanelWidth,cols));
-// std::cerr << Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) << "\n\n";
+ // update the respective rows of B from rhs
+ ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
+ (blockB, _rhs+k2+k1, rhsStride, -1, actualPanelWidth, cols, actual_kc, k1);
-// MatrixXf aux(Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr));
-// aux.setZero();
-
-// ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-// (aux.data(), &rhs(k2,0), rhsStride, -1, actual_kc, cols);
-
-// std::cerr << Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) - aux << "\n\n";
-
-
- // gebp
+ // GEBP
int i = k1+actualPanelWidth;
int rs = actual_kc-i;
-// ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-// (blockB, &rhs(k1,0), rhsStride, -1, actualPanelWidth, cols);
-
- ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
- (blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs);
-
if (rs>0)
- rhs.block(i,0,actual_kc-i,cols) -= lhs.block(i,k1,rs,actualPanelWidth) * rhs.block(k1,0,actualPanelWidth,cols);
-
-// gebp_kernel(_rhs+i+k2, rhsStride,
-// blockA/*+actual_kc*i+k1*rs*/, blockB/*+k1*Blocking::PacketSize*Blocking::nr*/, rs, actualPanelWidth, cols, actualPanelWidth/*actual_kc*/, actual_kc, 0, k1*Blocking::PacketSize);
-
-// gebp_kernel(_rhs+i, rhsStride,
-// blockA+actual_kc*i+k1*rs, blockB+k1*Blocking::PacketSize*Blocking::nr, rs, actualPanelWidth, cols, actual_kc, actual_kc);
+ {
+ ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
+ (blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs);
-// gebp_kernel(_rhs+k2+i, rhsStride,
-// blockA+actual_kc*i+k1, blockB+k1*Blocking::PacketSize, actual_kc-i, actualPanelWidth, cols, actual_kc, actual_kc);
+ gebp_kernel(_rhs+i+k2, rhsStride,
+ blockA, blockB, rs, actualPanelWidth, cols, actualPanelWidth, actual_kc, 0, k1*Blocking::PacketSize);
+ }
}
}
@@ -186,17 +183,15 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
{
const int actual_mc = std::min(i2+mc,size)-i2;
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
- (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
-
- std::cerr << i2 << " sur " << actual_mc << " -= " << i2 << "x" << k2 << "+" << actual_mc<<"," <<actual_kc << " * " << k2 << " sur " << actual_kc << "\n";
- rhs.block(i2,0,actual_mc,cols) -= lhs.block(i2,k2,actual_mc,actual_kc) * rhs.block(k2,0,actual_kc,cols);
-
-// gebp_kernel(_rhs+i2, rhsStride, blockA, blockB, actual_mc, actual_kc, cols);
+ (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+
+ gebp_kernel(_rhs+i2, rhsStride, blockA, blockB, actual_mc, actual_kc, cols);
}
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+// delete[] blockB;
}
};