author    Gael Guennebaud <g.gael@free.fr>    2009-07-24 10:53:31 +0200
committer Gael Guennebaud <g.gael@free.fr>    2009-07-24 10:53:31 +0200
commit    c6d06c22ac6f4b1c2923f050a3b8087f7b717ab0 (patch)
tree      acad724e00cc5f96f62e8ab6b06bcf33725febde /Eigen/src/Core
parent    6076173f0b941fbb4cd22ff6e0d1cb9f0b56452f (diff)
some cleaning
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h      122
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h   39
-rw-r--r--  Eigen/src/Core/products/SelfadjointProduct.h       327
3 files changed, 108 insertions, 380 deletions
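
The cleanup amounts to one interface change: ei_gebp_kernel and the pack helpers now derive packet_cols themselves from cols and nr, and callers pass an already-offset result pointer (res+i2) instead of separate packet_cols and i2 arguments. A minimal standalone sketch of the new calling convention follows; the names are hypothetical stand-ins, not the actual Eigen code.

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for the cleaned-up kernel interface: packet_cols is
// now computed inside the kernel from cols and nr, and the caller passes an
// already-offset result pointer (res+i2) instead of a separate i2 argument.
void gebp_like_kernel(double* res, int resStride, int rows, int depth, int cols)
{
  const int nr = 4;                   // register blocking along the columns
  int packet_cols = (cols/nr)*nr;     // derived here rather than by every caller
  (void)res; (void)resStride; (void)rows; (void)depth;
  std::printf("%d of %d columns handled %d at a time\n", packet_cols, cols, nr);
}

int main()
{
  const int rows = 8, cols = 10, depth = 6, mc = 4;
  double res[rows*cols] = {};
  for(int i2 = 0; i2 < rows; i2 += mc)
    gebp_like_kernel(res + i2, rows, std::min(mc, rows - i2), depth, cols);
}
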
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 9166921fe..776b7c5d6 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -78,9 +78,6 @@ static void run(int rows, int cols, int depth,
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
- // number of columns which can be processed by packet of nr columns
- int packet_cols = (cols/Blocking::nr) * Blocking::nr;
-
// => GEMM_VAR1
for(int k2=0; k2<depth; k2+=kc)
{
@@ -89,7 +86,7 @@ static void run(int rows, int cols, int depth,
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
- ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
+ ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
// => GEPP_VAR1
for(int i2=0; i2<rows; i2+=mc)
@@ -99,7 +96,7 @@ static void run(int rows, int cols, int depth,
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
- (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
}
}
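
For context, the // => GEMM_VAR1 and // => GEPP_VAR1 markers refer to the classic cache-blocking loop nest. A minimal sketch of that structure is below; the block sizes and names are illustrative only (the real values come from ei_product_blocking_traits).

#include <algorithm>

// Minimal sketch of the blocking loop nest (illustrative names and sizes).
void blocked_gemm_sketch(int rows, int cols, int depth)
{
  const int kc = 256;                      // cache block size along K
  const int mc = 512;                      // cache block size along M
  for(int k2 = 0; k2 < depth; k2 += kc)    // => GEMM_VAR1: slice the K range
  {
    const int actual_kc = std::min(k2+kc, depth) - k2;
    // pack the rhs panel for rows k2..k2+actual_kc once per K slice
    for(int i2 = 0; i2 < rows; i2 += mc)   // => GEPP_VAR1: slice the M range
    {
      const int actual_mc = std::min(i2+mc, rows) - i2;
      // pack the lhs block, then run the gebp kernel on res+i2 with
      // (actual_mc, actual_kc, cols) as its new (rows, depth, cols) arguments
      (void)actual_kc; (void)actual_mc;
    }
  }
  (void)cols;
}
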
@@ -113,19 +110,20 @@ static void run(int rows, int cols, int depth,
template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
- void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols)
+ void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols)
{
typedef typename ei_packet_traits<Scalar>::type PacketType;
enum { PacketSize = ei_packet_traits<Scalar>::size };
Conj cj;
- const int peeled_mc = (actual_mc/mr)*mr;
+ int packet_cols = (cols/nr) * nr;
+ const int peeled_mc = (rows/mr)*mr;
// loops on each cache friendly block of the result/rhs
for(int j2=0; j2<packet_cols; j2+=nr)
{
// loops on each register blocking of lhs/res
for(int i=0; i<peeled_mc; i+=mr)
{
- const Scalar* blA = &blockA[i*actual_kc];
+ const Scalar* blA = &blockA[i*depth];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@@ -134,20 +132,20 @@ struct ei_gebp_kernel
// gets res block as register
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
- C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
- if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
- if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
- C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
- if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
- if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
+ C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+ C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
+ if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
+ if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
+ C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
+ C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
+ if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
+ if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
// performs "inner" product
// TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- const int peeled_kc = (actual_kc/4)*4;
+ const Scalar* blB = &blockB[j2*depth*PacketSize];
+ const int peeled_kc = (depth/4)*4;
for(int k=0; k<peeled_kc; k+=4)
{
PacketType B0, B1, B2, B3, A0, A1;
@@ -214,7 +212,7 @@ struct ei_gebp_kernel
blA += 4*mr;
}
// process remaining peeled loop
- for(int k=peeled_kc; k<actual_kc; k++)
+ for(int k=peeled_kc; k<depth; k++)
{
PacketType B0, B1, B2, B3, A0, A1;
@@ -237,26 +235,26 @@ struct ei_gebp_kernel
blA += mr;
}
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
- ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
- ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
+ ei_pstoreu(&res[(j2+0)*resStride + i], C0);
+ ei_pstoreu(&res[(j2+1)*resStride + i], C1);
+ if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
+ if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
+ ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
+ ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
+ if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
+ if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
}
- for(int i=peeled_mc; i<actual_mc; i++)
+ for(int i=peeled_mc; i<rows; i++)
{
- const Scalar* blA = &blockA[i*actual_kc];
+ const Scalar* blA = &blockA[i*depth];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// gets a 1 x nr res block as registers
Scalar C0(0), C1(0), C2(0), C3(0);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
+ const Scalar* blB = &blockB[j2*depth*PacketSize];
+ for(int k=0; k<depth; k++)
{
Scalar B0, B1, B2, B3, A0;
@@ -272,10 +270,10 @@ struct ei_gebp_kernel
blB += nr*PacketSize;
}
- res[(j2+0)*resStride + i2 + i] += C0;
- res[(j2+1)*resStride + i2 + i] += C1;
- if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
- if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
+ res[(j2+0)*resStride + i] += C0;
+ res[(j2+1)*resStride + i] += C1;
+ if(nr==4) res[(j2+2)*resStride + i] += C2;
+ if(nr==4) res[(j2+3)*resStride + i] += C3;
}
}
@@ -285,7 +283,7 @@ struct ei_gebp_kernel
{
for(int i=0; i<peeled_mc; i+=mr)
{
- const Scalar* blA = &blockA[i*actual_kc];
+ const Scalar* blA = &blockA[i*depth];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@@ -294,11 +292,11 @@ struct ei_gebp_kernel
// gets res block as register
PacketType C0, C4;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
+ C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+ C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
+ const Scalar* blB = &blockB[j2*depth*PacketSize];
+ for(int k=0; k<depth; k++)
{
PacketType B0, A0, A1;
@@ -312,22 +310,22 @@ struct ei_gebp_kernel
blA += mr;
}
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
+ ei_pstoreu(&res[(j2+0)*resStride + i], C0);
+ ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
}
- for(int i=peeled_mc; i<actual_mc; i++)
+ for(int i=peeled_mc; i<rows; i++)
{
- const Scalar* blA = &blockA[i*actual_kc];
+ const Scalar* blA = &blockA[i*depth];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// gets a 1 x 1 res block as registers
Scalar C0(0);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
+ const Scalar* blB = &blockB[j2*depth*PacketSize];
+ for(int k=0; k<depth; k++)
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
- res[(j2+0)*resStride + i2 + i] += C0;
+ res[(j2+0)*resStride + i] += C0;
}
}
}
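
The peeled_kc bookkeeping in the kernel above is plain 4-way loop unrolling. Stripped of the packet loads and register blocking, the pattern reduces to the following scalar sketch (hypothetical helper, for illustration only).

// Scalar sketch of the peeled inner product loop: the K dimension is
// unrolled by 4, with a remainder loop for the last depth % 4 iterations.
double peeled_inner_product(const double* blA, const double* blB, int depth)
{
  double C0 = 0;
  const int peeled_kc = (depth/4)*4;       // largest multiple of 4 <= depth
  for(int k = 0; k < peeled_kc; k += 4)    // unrolled body
  {
    C0 += blA[k+0]*blB[k+0];
    C0 += blA[k+1]*blB[k+1];
    C0 += blA[k+2]*blB[k+2];
    C0 += blA[k+3]*blB[k+3];
  }
  for(int k = peeled_kc; k < depth; k++)   // process remaining peeled loop
    C0 += blA[k]*blB[k];
  return C0;
}
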
@@ -350,19 +348,19 @@ struct ei_gebp_kernel
template<typename Scalar, int mr, int StorageOrder, bool Conjugate>
struct ei_gemm_pack_lhs
{
- void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
+ void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
{
ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
int count = 0;
- const int peeled_mc = (actual_mc/mr)*mr;
+ const int peeled_mc = (rows/mr)*mr;
for(int i=0; i<peeled_mc; i+=mr)
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
for(int w=0; w<mr; w++)
blockA[count++] = cj(lhs(i+w, k));
- for(int i=peeled_mc; i<actual_mc; i++)
+ for(int i=peeled_mc; i<rows; i++)
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
blockA[count++] = cj(lhs(i, k));
}
}
@@ -379,9 +377,10 @@ template<typename Scalar, int nr>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
+ void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
{
bool hasAlpha = alpha != Scalar(1);
+ int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
@@ -391,7 +390,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
if (hasAlpha)
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
@@ -405,7 +404,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
}
else
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
@@ -424,7 +423,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
if (hasAlpha)
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
count += PacketSize;
@@ -432,7 +431,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
}
else
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(b0[k]));
count += PacketSize;
@@ -447,15 +446,16 @@ template<typename Scalar, int nr>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
+ void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
{
bool hasAlpha = alpha != Scalar(1);
+ int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
if (hasAlpha)
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
@@ -470,7 +470,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
}
else
{
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
@@ -488,7 +488,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
for(int j2=packet_cols; j2<cols; ++j2)
{
const Scalar* b0 = &rhs[j2];
- for(int k=0; k<actual_kc; k++)
+ for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
count += PacketSize;
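
To make the packing side concrete: ei_gemm_pack_rhs interleaves nr columns at a time, pre-scales by alpha, and stores each coefficient as a broadcast packet (ei_pset1) so the kernel can consume it directly. Below is a scalar sketch with PacketSize collapsed to 1; the name is hypothetical and this is not the real implementation.

// Scalar sketch of the ColMajor rhs packing (PacketSize collapsed to 1; the
// real code stores ei_pset1(alpha*b[k]) packets instead of plain scalars).
void pack_rhs_sketch(double* blockB, const double* rhs, int rhsStride,
                     double alpha, int depth, int cols)
{
  const int nr = 4;
  const int packet_cols = (cols/nr)*nr;        // columns handled nr at a time
  int count = 0;
  for(int j2 = 0; j2 < packet_cols; j2 += nr)
    for(int k = 0; k < depth; k++)
      for(int w = 0; w < nr; w++)              // interleave nr columns
        blockB[count++] = alpha * rhs[(j2+w)*rhsStride + k];
  for(int j2 = packet_cols; j2 < cols; ++j2)   // leftover columns one at a time
    for(int k = 0; k < depth; k++)
      blockB[count++] = alpha * rhs[j2*rhsStride + k];
}
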
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 073a5ecf7..acd239d28 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -29,11 +29,11 @@
template<typename Scalar, int mr, int StorageOrder>
struct ei_symm_pack_lhs
{
- void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
+ void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows)
{
ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride);
int count = 0;
- const int peeled_mc = (actual_mc/mr)*mr;
+ const int peeled_mc = (rows/mr)*mr;
for(int i=0; i<peeled_mc; i+=mr)
{
// normal copy
@@ -51,17 +51,17 @@ struct ei_symm_pack_lhs
++h;
}
// transposed copy
- for(int k=i+mr; k<actual_kc; k++)
+ for(int k=i+mr; k<cols; k++)
for(int w=0; w<mr; w++)
blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
}
// do the same with mr==1
- for(int i=peeled_mc; i<actual_mc; i++)
+ for(int i=peeled_mc; i<rows; i++)
{
for(int k=0; k<=i; k++)
blockA[count++] = lhs(i, k); // normal
- for(int k=i+1; k<actual_kc; k++)
+ for(int k=i+1; k<cols; k++)
blockA[count++] = ei_conj(lhs(k, i)); // transposed
}
}
@@ -71,11 +71,12 @@ template<typename Scalar, int nr, int StorageOrder>
struct ei_symm_pack_rhs
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols, int k2)
+ void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int rows, int cols, int k2)
{
- int end_k = k2 + actual_kc;
+ int end_k = k2 + rows;
int count = 0;
ei_const_blas_data_mapper<Scalar,StorageOrder> rhs(_rhs,rhsStride);
+ int packet_cols = (cols/nr)*nr;
// first part: normal case
for(int j2=0; j2<k2; j2+=nr)
@@ -94,7 +95,7 @@ struct ei_symm_pack_rhs
}
// second part: diagonal block
- for(int j2=k2; j2<std::min(k2+actual_kc,packet_cols); j2+=nr)
+ for(int j2=k2; j2<std::min(k2+rows,packet_cols); j2+=nr)
{
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
@@ -137,7 +138,7 @@ struct ei_symm_pack_rhs
}
// third part: transposed
- for(int j2=k2+actual_kc; j2<packet_cols; j2+=nr)
+ for(int j2=k2+rows; j2<packet_cols; j2+=nr)
{
for(int k=k2; k<end_k; k++)
{
@@ -163,7 +164,7 @@ struct ei_symm_pack_rhs
count += PacketSize;
}
// normal
- for(int k=half; k<k2+actual_kc; k++)
+ for(int k=half; k<k2+rows; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
count += PacketSize;
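
The hunks above touch the three column ranges ei_symm_pack_rhs visits for a given K slice. A compact sketch of just those ranges (scalar, nr-column blocking omitted, hypothetical name):

#include <algorithm>

// Sketch of the three column ranges of the symmetric rhs packing for the
// K slice [k2, k2+rows): columns before the slice are copied normally,
// columns inside it straddle the diagonal, and columns past it are read
// transposed (conjugated), since only one triangle of the matrix is stored.
void symm_pack_rhs_ranges_sketch(int cols, int k2, int rows)
{
  const int end_k = k2 + rows;
  for(int j2 = 0; j2 < k2; ++j2)
  {
    // first part: normal copy
  }
  for(int j2 = k2; j2 < std::min(end_k, cols); ++j2)
  {
    // second part: diagonal block, split into transposed/diagonal/normal
  }
  for(int j2 = end_k; j2 < cols; ++j2)
  {
    // third part: transposed copy
  }
}
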
@@ -233,9 +234,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
- // number of columns which can be processed by packet of nr columns
- int packet_cols = (cols/Blocking::nr)*Blocking::nr;
-
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
for(int k2=0; k2<size; k2+=kc)
@@ -246,7 +244,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
- (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
+ (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
// the selected lhs panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@@ -259,7 +257,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true>()
(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
}
// the block diagonal
{
@@ -268,7 +266,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
+ gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
}
for(int i2=k2+kc; i2<size; i2+=mc)
@@ -277,7 +275,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
}
}
@@ -316,9 +314,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
- // number of columns which can be processed by packet of nr columns
- int packet_cols = (cols/Blocking::nr)*Blocking::nr;
-
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
for(int k2=0; k2<size; k2+=kc)
@@ -326,7 +321,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
const int actual_kc = std::min(k2+kc,size)-k2;
ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
- (blockB, _rhs, rhsStride, alpha, actual_kc, packet_cols, cols, k2);
+ (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2);
// => GEPP
for(int i2=0; i2<rows; i2+=mc)
@@ -335,7 +330,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
- gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
}
}
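
The selfadjoint-lhs path keeps the same blocking, but each K slice of the lhs panel is handled in the three parts the comments above describe. A rough sketch of that traversal, illustrative only:

#include <algorithm>

// Rough sketch of the selfadjoint lhs traversal: each K slice of the panel
// splits at the diagonal block into a transposed copy, a symmetric packing,
// and a plain copy.
void selfadjoint_lhs_sketch(int size, int kc, int mc)
{
  for(int k2 = 0; k2 < size; k2 += kc)
  {
    const int actual_kc = std::min(k2+kc, size) - k2;
    for(int i2 = 0; i2 < k2; i2 += mc)
    {
      // 1 - above the diagonal block: transposed packed copy, gebp on res+i2
    }
    // 2 - the diagonal block itself: ei_symm_pack_lhs, gebp on res+k2
    for(int i2 = k2+kc; i2 < size; i2 += mc)
    {
      // 3 - below the diagonal block: normal packed copy, gebp on res+i2
    }
    (void)actual_kc;
  }
}
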
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index facde7252..d971720d6 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -70,7 +70,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>
typedef ei_product_blocking_traits<Scalar> Blocking;
- int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
+ int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
@@ -90,7 +90,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>
// note that the actual rhs is the transpose/adjoint of mat
ei_gemm_pack_rhs<Scalar,Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor>()
- (blockB, &mat(0,k2), matStride, alpha, actual_kc, packet_cols, size);
+ (blockB, &mat(0,k2), matStride, alpha, actual_kc, size);
for(int i2=0; i2<size; i2+=mc)
{
@@ -104,16 +104,15 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==LowerTriangular)
- gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, std::min(packet_cols,i2), i2, std::min(size,i2));
+ gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2));
ei_sybb_kernel<Scalar, Blocking::mr, Blocking::nr, Conj, UpLo>()
- (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc, std::min(actual_mc,std::max(packet_cols-i2,0)));
+ (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc);
if (UpLo==UpperTriangular)
{
int j2 = i2+actual_mc;
- gebp_kernel(res+resStride*j2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc,
- std::max(0,packet_cols-j2), i2, std::max(0,size-j2));
+ gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2));
}
}
}
@@ -149,323 +148,57 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
}
-// optimized SYmmetric packed Block * packed Block product kernel
+// Optimized SYmmetric packed Block * packed Block product kernel.
+// This kernel is built on top of the gebp kernel:
+// - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize
+// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+// - then, as usual, each panel is split into three parts along the diagonal,
+// the sub blocks above and below the diagonal are processed as usual,
+// while the selfadjoint block overlapping the diagonal is evaluated into a
+// small temporary buffer which is then accumulated into the result using a
+// triangular traversal.
template<typename Scalar, int mr, int nr, typename Conj, int UpLo>
struct ei_sybb_kernel
{
enum {
PacketSize = ei_packet_traits<Scalar>::size,
- BlockSize = EIGEN_ENUM_MAX(mr,nr)
+ BlockSize = EIGEN_ENUM_MAX(mr,nr)
};
- void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols)
+ void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth)
{
ei_gebp_kernel<Scalar, mr, nr, Conj> gebp_kernel;
Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;
-
+
// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.
- for (int j=0; j<actual_mc; j+=BlockSize)
+ for (int j=0; j<size; j+=BlockSize)
{
- int actualBlockSize = std::min(BlockSize,actual_mc - j);
- int actualPacketCols = std::min(actualBlockSize,std::max(packet_cols-j,0));
- const Scalar* actual_b = blockB+j*actual_kc*PacketSize;
-
+ int actualBlockSize = std::min<int>(BlockSize,size - j);
+ const Scalar* actual_b = blockB+j*depth*PacketSize;
+
if(UpLo==UpperTriangular)
- gebp_kernel(res+j*resStride, resStride, blockA, actual_b,
- j, actual_kc, actualPacketCols, 0, actualBlockSize);
+ gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize);
// selfadjoint micro block
- // => use the gebp kernel on a temporary buffer,
- // and then perform a triangular copy
{
int i = j;
buffer.setZero();
- gebp_kernel(buffer.data(), BlockSize, blockA+actual_kc*i, actual_b,
- actualBlockSize, actual_kc, actualPacketCols, 0, actualBlockSize);
+ // 1 - apply the kernel on the temporary buffer
+ gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize);
+ // 2 - triangular accumulation
for(int j1=0; j1<actualBlockSize; ++j1)
{
- Scalar* r = res + (j+j1)*resStride;
+ Scalar* r = res + (j+j1)*resStride + i;
for(int i1=UpLo==LowerTriangular ? j1 : 0;
- UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++j1)
- r[i1+j1*resStride] = buffer(i1,j1);
+ UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++i1)
+ r[i1] += buffer(i1,j1);
}
}
-
- if(UpLo==LowerTriangular)
- {
- int i = j+actualBlockSize;
- if(i<actual_mc)
- gebp_kernel(res+j*resStride+i, resStride, blockA+actual_kc*i, actual_b,
- actual_mc-i, actual_kc, actualPacketCols, 0, actualBlockSize);
- }
- }
- }
-};
-// optimized SYmmetric packed Block * packed Block product kernel
-// this kernel is very similar to the gebp kernel: the only differences are
-// the piece of code to avoid the writes off the diagonal
-// => TODO find a way to factorize the two kernels in a single one
-template<typename Scalar, int mr, int nr, typename Conj, int UpLo>
-struct ei_sybb_kernel_
-{
- void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols)
- {
- typedef typename ei_packet_traits<Scalar>::type PacketType;
- enum { PacketSize = ei_packet_traits<Scalar>::size };
- Conj cj;
- const int peeled_mc = (actual_mc/mr)*mr;
- // loops on each cache friendly block of the result/rhs
- for(int j2=0; j2<packet_cols; j2+=nr)
- {
- // here we selected a vertical mc x nr panel of the result that we'll
- // process normally until the end of the diagonal (or from the start if upper)
- //
- int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0;
- int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,((j2+std::max(mr,nr))/mr)*mr);
- for(int i=start_i; i<std::min(peeled_mc,end_i); i+=mr)
- {
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // TODO move the res loads to the stores
-
- // gets res block as register
- PacketType C0, C1, C2, C3, C4, C5, C6, C7;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
- if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
- if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
- C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
- if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
- if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
-
- // performs "inner" product
- // TODO let's check wether the flowing peeled loop could not be
- // optimized via optimal prefetching from one loop to the other
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- const int peeled_kc = (actual_kc/4)*4;
- for(int k=0; k<peeled_kc; k+=4)
- {
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[2*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[3*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[4*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[5*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[6*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[7*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
- blB += 4*nr*PacketSize;
- blA += 4*mr;
- }
- // process remaining peeled loop
- for(int k=peeled_kc; k<actual_kc; k++)
- {
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
- blB += nr*PacketSize;
- blA += mr;
- }
-
- // let's check whether the mr x nr block overlap the diagonal,
- // is so then we have to carefully discard writes off the diagonal
- if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2)
- {
- ei_pstoreu(&res[(j2+0)*resStride + i], C0);
- ei_pstoreu(&res[(j2+1)*resStride + i], C1);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
- ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
- ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
- }
- else
- {
- Scalar buf[mr*nr];
- // overlap => copy to a temporary mr x nr buffer and then triangular copy
- ei_pstore(&buf[0*mr], C0);
- ei_pstore(&buf[1*mr], C1);
- if(nr==4) ei_pstore(&buf[2*mr], C2);
- if(nr==4) ei_pstore(&buf[3*mr], C3);
- ei_pstore(&buf[0*mr + PacketSize], C4);
- ei_pstore(&buf[1*mr + PacketSize], C5);
- if(nr==4) ei_pstore(&buf[2*mr + PacketSize], C6);
- if(nr==4) ei_pstore(&buf[3*mr + PacketSize], C7);
-
- for(int j1=0; j1<nr; ++j1)
- for(int i1=0; i1<mr; ++i1)
- {
- if(UpLo==LowerTriangular ? i+i1 >= j2+j1 : i+i1 <= j2+j1)
- res[(j2+j1)*resStride + i+i1] = buf[i1 + j1 * mr];
- }
- }
- }
- for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++)
- {
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // gets a 1 x nr res block as registers
- Scalar C0(0), C1(0), C2(0), C3(0);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
- {
- Scalar B0, B1, B2, B3, A0;
-
- A0 = blA[k];
- B0 = blB[0*PacketSize];
- B1 = blB[1*PacketSize];
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = blB[2*PacketSize];
- if(nr==4) B3 = blB[3*PacketSize];
- C1 = cj.pmadd(A0, B1, C1);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-
- blB += nr*PacketSize;
- }
- if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2) {
- res[(j2+0)*resStride + i] += C0;
- res[(j2+1)*resStride + i] += C1;
- if(nr==4) res[(j2+2)*resStride + i] += C2;
- if(nr==4) res[(j2+3)*resStride + i] += C3;
- }
- else
- {
- if(UpLo==LowerTriangular ? i>=j2+0 : i<=j2+0) res[(j2+0)*resStride + i] += C0;
- if(UpLo==LowerTriangular ? i>=j2+1 : i<=j2+1) res[(j2+1)*resStride + i] += C1;
- if(nr==4) if(UpLo==LowerTriangular ? i>=j2+2 : i<=j2+2) res[(j2+2)*resStride + i] += C2;
- if(nr==4) if(UpLo==LowerTriangular ? i>=j2+3 : i<=j2+3) res[(j2+3)*resStride + i] += C3;
- }
- }
- }
-
- // process remaining rhs/res columns one at a time
- // => do the same but with nr==1
- for(int j2=packet_cols; j2<actual_mc; j2++)
- {
- int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0;
- int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,j2+1);
- for(int i=start_i; i<std::min(end_i,peeled_mc); i+=mr)
- {
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // TODO move the res loads to the stores
-
- // gets res block as register
- PacketType C0, C4;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
-
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
- {
- PacketType B0, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
-
- blB += PacketSize;
- blA += mr;
- }
-
- if(UpLo==LowerTriangular ? i>=j2 : i<=j2) ei_pstoreu(&res[(j2+0)*resStride + i], C0);
- if(UpLo==LowerTriangular ? i+PacketSize>=j2 : i+PacketSize<=j2) ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
- }
if(UpLo==LowerTriangular)
- start_i = j2;
- for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++)
{
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // gets a 1 x 1 res block as registers
- Scalar C0(0);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
- C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
- res[(j2+0)*resStride + i] += C0;
+ int i = j+actualBlockSize;
+ gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize);
}
}
}
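
Finally, the rewritten ei_sybb_kernel evaluates each diagonal-overlapping block into a small dense buffer via gebp and then accumulates only the wanted triangle into res. That last step, mirroring the r[i1] += buffer(i1,j1) loop above, boils down to this standalone sketch (hypothetical helper):

// Sketch of the triangular accumulation: the gebp result sitting in a dense
// blockSize x blockSize column-major buffer is added back into res, keeping
// only the lower (lower==true) or upper triangle so no write ever crosses
// the diagonal of the selfadjoint block.
void triangular_accumulate(double* res, int resStride, const double* buffer,
                           int blockSize, bool lower)
{
  for(int j1 = 0; j1 < blockSize; ++j1)
  {
    double* r = res + j1*resStride;
    for(int i1 = lower ? j1 : 0; lower ? i1 < blockSize : i1 <= j1; ++i1)
      r[i1] += buffer[i1 + j1*blockSize];
  }
}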