-rw-r--r--  Eigen/Core                                          |   1
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h       | 667
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h   | 184
-rw-r--r--  test/product_selfadjoint.cpp                        |  26
4 files changed, 518 insertions(+), 360 deletions(-)
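This commit factors the blocked GEMM implementation into three reusable pieces (ei_gemm_pack_rhs, ei_gemm_pack_lhs and the ei_gebp_kernel micro-kernel) and builds a selfadjoint (SYMM) product on top of them. The blocking structure those pieces implement looks roughly like the following scalar sketch; this is a simplified model for orientation only, ignoring vectorization, alpha scaling, conjugation and the packed storage layouts, and all names are illustrative rather than Eigen's:

    #include <algorithm>

    // Simplified scalar model of the blocked product res += lhs * rhs,
    // all matrices dense and column-major.
    void gemm_blocked(const float* lhs, const float* rhs, float* res,
                      int rows, int depth, int cols,
                      int kc, int mc)  // cache block sizes along K and M
    {
      for (int k2 = 0; k2 < depth; k2 += kc)    // select one kc-deep rhs row panel
      {
        const int actual_kc = std::min(k2 + kc, depth) - k2;
        // (here the real code packs the rhs panel once: ei_gemm_pack_rhs)
        for (int i2 = 0; i2 < rows; i2 += mc)   // select one mc-tall lhs block
        {
          const int actual_mc = std::min(i2 + mc, rows) - i2;
          // (here the real code packs the lhs block: ei_gemm_pack_lhs)
          // GEBP: multiply the mc x kc lhs block by the kc x cols rhs panel
          for (int j = 0; j < cols; ++j)
            for (int k = 0; k < actual_kc; ++k)
              for (int i = 0; i < actual_mc; ++i)
                res[j*rows + i2+i] += lhs[(k2+k)*rows + i2+i] * rhs[j*depth + k2+k];
        }
      }
    }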
diff --git a/Eigen/Core b/Eigen/Core
index ee9bfe325..ea871dca3 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -180,6 +180,7 @@ namespace Eigen {
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
+#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/SolveTriangular.h"
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index fe3e877e1..30fa4aa66 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -25,6 +25,15 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
+template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj>
+struct ei_gebp_kernel;
+
+template<typename Scalar, int PacketSize, int nr>
+struct ei_gemm_pack_rhs;
+
+template<typename Scalar, int mr>
+struct ei_gemm_pack_lhs;
+
template <int L2MemorySize,typename Scalar>
struct ei_L2_block_traits {
enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
@@ -124,10 +133,6 @@ static void ei_cache_friendly_product(
typedef typename ei_packet_traits<Scalar>::type PacketType;
-
-
-#ifndef EIGEN_USE_ALT_PRODUCT
-
enum {
PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
@@ -166,398 +171,346 @@ static void ei_cache_friendly_product(
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
- {
- int count = 0;
- for(int j2=0; j2<packet_cols; j2+=nr)
- {
- const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
- const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
- const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
- const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
- if (hasAlpha)
- {
- for(int k=0; k<actual_kc; k++)
- {
- ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
- ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
- if (nr==4)
- {
- ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
- ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
- }
- count += nr*PacketSize;
- }
- }
- else
- {
- for(int k=0; k<actual_kc; k++)
- {
- ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
- ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
- if (nr==4)
- {
- ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
- ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
- }
- count += nr*PacketSize;
- }
- }
- }
- }
+ ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
// => GEPP_VAR1
for(int i2=0; i2<rows; i2+=mc)
{
const int actual_mc = std::min(i2+mc,rows)-i2;
- // We have selected a mc x kc block of lhs
- // Let's pack it in a clever order for further purely sequential access
- int count = 0;
- const int peeled_mc = (actual_mc/mr)*mr;
- if (lhsRowMajor)
- {
- for(int i=0; i<peeled_mc; i+=mr)
- for(int k=0; k<actual_kc; k++)
- for(int w=0; w<mr; w++)
- blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
- for(int i=peeled_mc; i<actual_mc; i++)
- {
- const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
- for(int k=0; k<actual_kc; k++)
- blockA[count++] = llhs[k];
- }
- }
- else
- {
- for(int i=0; i<peeled_mc; i+=mr)
- for(int k=0; k<actual_kc; k++)
- for(int w=0; w<mr; w++)
- blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
- for(int i=peeled_mc; i<actual_mc; i++)
- for(int k=0; k<actual_kc; k++)
- blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
- }
+ ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);
+
+ ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
+ (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ }
+ }
+
+ ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+ ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
+}
- // GEBP
- // loops on each cache friendly block of the result/rhs
- for(int j2=0; j2<packet_cols; j2+=nr)
+// optimized GEneral packed Block * packed Panel product kernel
+template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj>
+struct ei_gebp_kernel
+{
+ void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols)
+ {
+ Conj cj;
+ const int peeled_mc = (actual_mc/mr)*mr;
+ // loops on each cache friendly block of the result/rhs
+ for(int j2=0; j2<packet_cols; j2+=nr)
+ {
+ // loops on each register blocking of lhs/res
+ for(int i=0; i<peeled_mc; i+=mr)
{
- // loops on each register blocking of lhs/res
- const int peeled_mc = (actual_mc/mr)*mr;
- for(int i=0; i<peeled_mc; i+=mr)
+ const Scalar* blA = &blockA[i*actual_kc];
+ #ifdef EIGEN_VECTORIZE_SSE
+ _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+ #endif
+
+ // TODO move the res loads to the stores
+
+ // gets res block as register
+ PacketType C0, C1, C2, C3, C4, C5, C6, C7;
+ C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
+ C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
+ if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
+ if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
+ C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
+ C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
+ if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
+ if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
+
+ // performs "inner" product
+        // TODO let's check whether the following peeled loop could not be
+ // optimized via optimal prefetching from one loop to the other
+ const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+ const int peeled_kc = (actual_kc/4)*4;
+ for(int k=0; k<peeled_kc; k+=4)
{
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // TODO move the res loads to the stores
-
- // gets res block as register
- PacketType C0, C1, C2, C3, C4, C5, C6, C7;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
- C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
- if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
- if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
- C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
- if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
- if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
-
- // performs "inner" product
- // TODO let's check wether the flowing peeled loop could not be
- // optimized via optimal prefetching from one loop to the other
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- const int peeled_kc = (actual_kc/4)*4;
- for(int k=0; k<peeled_kc; k+=4)
- {
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[2*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[3*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[4*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[5*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[6*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[7*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
- blB += 4*nr*PacketSize;
- blA += 4*mr;
- }
- // process remaining peeled loop
- for(int k=peeled_kc; k<actual_kc; k++)
- {
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
- blB += nr*PacketSize;
- blA += mr;
- }
-
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
- ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
- ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
- ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
- if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
- if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
+ PacketType B0, B1, B2, B3, A0, A1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+ C0 = cj.pmadd(A0, B0, C0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ C4 = cj.pmadd(A1, B0, C4);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+ C1 = cj.pmadd(A0, B1, C1);
+ C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[2*PacketSize]);
+ if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[3*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
+ C0 = cj.pmadd(A0, B0, C0);
+ C4 = cj.pmadd(A1, B0, C4);
+ B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
+ C1 = cj.pmadd(A0, B1, C1);
+ C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[4*PacketSize]);
+ if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[5*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
+
+ C0 = cj.pmadd(A0, B0, C0);
+ C4 = cj.pmadd(A1, B0, C4);
+ B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
+ C1 = cj.pmadd(A0, B1, C1);
+ C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[6*PacketSize]);
+ if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[7*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
+ C0 = cj.pmadd(A0, B0, C0);
+ C4 = cj.pmadd(A1, B0, C4);
+ C1 = cj.pmadd(A0, B1, C1);
+ C5 = cj.pmadd(A1, B1, C5);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+
+ blB += 4*nr*PacketSize;
+ blA += 4*mr;
}
- for(int i=peeled_mc; i<actual_mc; i++)
+ // process remaining peeled loop
+ for(int k=peeled_kc; k<actual_kc; k++)
{
- const Scalar* blA = &blockA[i*actual_kc];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
-
- // gets a 1 x nr res block as registers
- Scalar C0(0), C1(0), C2(0), C3(0);
- const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
- for(int k=0; k<actual_kc; k++)
- {
- Scalar B0, B1, B2, B3, A0;
-
- A0 = blA[k];
- B0 = blB[0*PacketSize];
- B1 = blB[1*PacketSize];
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = blB[2*PacketSize];
- if(nr==4) B3 = blB[3*PacketSize];
- C1 = cj.pmadd(A0, B1, C1);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-
- blB += nr*PacketSize;
- }
- res[(j2+0)*resStride + i2 + i] += C0;
- res[(j2+1)*resStride + i2 + i] += C1;
- if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
- if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
+ PacketType B0, B1, B2, B3, A0, A1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+ C0 = cj.pmadd(A0, B0, C0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ C4 = cj.pmadd(A1, B0, C4);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ C1 = cj.pmadd(A0, B1, C1);
+ C5 = cj.pmadd(A1, B1, C5);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+
+ blB += nr*PacketSize;
+ blA += mr;
}
+
+ ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
+ ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
+ if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
+ if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
+ ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
+ ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
+ if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
+ if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
}
- // remaining rhs/res columns (<nr)
- for(int j2=packet_cols; j2<cols; j2++)
+ for(int i=peeled_mc; i<actual_mc; i++)
{
- for(int i=0; i<actual_mc; i++)
+ const Scalar* blA = &blockA[i*actual_kc];
+ #ifdef EIGEN_VECTORIZE_SSE
+ _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+ #endif
+
+ // gets a 1 x nr res block as registers
+ Scalar C0(0), C1(0), C2(0), C3(0);
+ const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+ for(int k=0; k<actual_kc; k++)
{
- Scalar c0 = Scalar(0);
- if (lhsRowMajor)
- for(int k=0; k<actual_kc; k++)
- c0 += cj.pmul(lhs[(k2+k)+(i2+i)*lhsStride], rhs[j2*rhsStride + k2 + k]);
- else
- for(int k=0; k<actual_kc; k++)
- c0 += cj.pmul(lhs[(k2+k)*lhsStride + i2+i], rhs[j2*rhsStride + k2 + k]);
- res[(j2)*resStride + i2+i] += (ConjugateRhs ? ei_conj(alpha) : alpha) * c0;
+ Scalar B0, B1, B2, B3, A0;
+
+ A0 = blA[k];
+ B0 = blB[0*PacketSize];
+ B1 = blB[1*PacketSize];
+ C0 = cj.pmadd(A0, B0, C0);
+ if(nr==4) B2 = blB[2*PacketSize];
+ if(nr==4) B3 = blB[3*PacketSize];
+ C1 = cj.pmadd(A0, B1, C1);
+ if(nr==4) C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+
+ blB += nr*PacketSize;
}
+ res[(j2+0)*resStride + i2 + i] += C0;
+ res[(j2+1)*resStride + i2 + i] += C1;
+ if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
+ if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
}
}
- }
+ // remaining rhs/res columns (<nr)
- ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
+ // process remaining rhs/res columns one at a time
+ // => do the same but with nr==1
+ for(int j2=packet_cols; j2<cols; j2++)
+ {
+ for(int i=0; i<peeled_mc; i+=mr)
+ {
+ const Scalar* blA = &blockA[i*actual_kc];
+ #ifdef EIGEN_VECTORIZE_SSE
+ _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+ #endif
+ // TODO move the res loads to the stores
-
-#else // alternate product from cylmor
+ // gets res block as register
+ PacketType C0, C4;
+ C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
+ C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
- enum {
- PacketSize = sizeof(PacketType)/sizeof(Scalar),
- #if (defined __i386__)
- // i386 architecture provides only 8 xmm registers,
- // so let's reduce the max number of rows processed at once.
- MaxBlockRows = 4,
- MaxBlockRows_ClampingMask = 0xFFFFFC,
- #else
- MaxBlockRows = 8,
- MaxBlockRows_ClampingMask = 0xFFFFF8,
- #endif
- // maximal size of the blocks fitted in L2 cache
- MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
- };
+ const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+ for(int k=0; k<actual_kc; k++)
+ {
+ PacketType B0, A0, A1;
- const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ C0 = cj.pmadd(A0, B0, C0);
+ C4 = cj.pmadd(A1, B0, C4);
- const int remainingSize = depth % PacketSize;
- const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
+ blB += PacketSize;
+ blA += mr;
+ }
- const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512;
- const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128;
- const int l2BlockSize = MaxL2BlockSize > size ? size : 256;
- const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
- const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
+ ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
+ ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
+ }
+ for(int i=peeled_mc; i<actual_mc; i++)
+ {
+ const Scalar* blA = &blockA[i*actual_kc];
+ #ifdef EIGEN_VECTORIZE_SSE
+ _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+ #endif
+
+ // gets a 1 x 1 res block as registers
+ Scalar C0(0);
+ const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+ for(int k=0; k<actual_kc; k++)
+ C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
+ res[(j2+0)*resStride + i2 + i] += C0;
+ }
+ }
+ }
+};
- Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size];
-// for(int i=0; i<l2BlockRows*l2BlockSize; ++i)
-// block[i] = 0;
- // loops on each L2 cache friendly blocks of lhs
- for(int l2k=0; l2k<depth; l2k+=l2BlockSize)
+// pack a block of the lhs
+template<typename Scalar, int mr>
+struct ei_gemm_pack_lhs
+{
+ void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
{
- for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+ int count = 0;
+ const int peeled_mc = (actual_mc/mr)*mr;
+ if (lhsRowMajor)
{
- // We have selected a block of lhs
- // Packs this block into 'block'
- int count = 0;
- for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
+ for(int i=0; i<peeled_mc; i+=mr)
+ for(int k=0; k<actual_kc; k++)
+ for(int w=0; w<mr; w++)
+ blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
+ for(int i=peeled_mc; i<actual_mc; i++)
{
- for(int i=0; i<l2BlockRows; i+=2*PacketSize)
- for (int w=0; w<MaxBlockRows; ++w)
- for (int y=0; y<2*PacketSize; ++y)
- block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y];
+ const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
+ for(int k=0; k<actual_kc; k++)
+ blockA[count++] = llhs[k];
}
+ }
+ else
+ {
+ for(int i=0; i<peeled_mc; i+=mr)
+ for(int k=0; k<actual_kc; k++)
+ for(int w=0; w<mr; w++)
+ blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
+ for(int i=peeled_mc; i<actual_mc; i++)
+ for(int k=0; k<actual_kc; k++)
+ blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
+ }
+ }
+};
- // loops on each L2 cache firendly block of the result/rhs
- for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+// copy a complete panel of the rhs while expanding each coefficient into a packet form
+template<typename Scalar, int PacketSize, int nr>
+struct ei_gemm_pack_rhs
+{
+ void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols)
+ {
+ int count = 0;
+ for(int j2=0; j2<packet_cols; j2+=nr)
+ {
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
+ const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
+ const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
+ const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
+ if (hasAlpha)
+ {
+ for(int k=0; k<actual_kc; k++)
+ {
+ ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
+ ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
+ if (nr==4)
+ {
+ ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
+ ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
+ }
+ count += nr*PacketSize;
+ }
+ }
+ else
{
- for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
+ for(int k=0; k<actual_kc; k++)
{
- for(int j=0; j<l2BlockCols; ++j)
+ ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
+ ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
+ if (nr==4)
{
- PacketType A0, A1, A2, A3, A4, A5, A6, A7;
-
- // Load the packets from rhs and reorder them
-
- // Here we need some vector reordering
- // Right now its hardcoded to packets of 4 elements
- const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)];
- A0 = ei_pset1(lrhs[0]);
- A1 = ei_pset1(lrhs[1]);
- A2 = ei_pset1(lrhs[2]);
- A3 = ei_pset1(lrhs[3]);
- if (MaxBlockRows==8)
- {
- A4 = ei_pset1(lrhs[4]);
- A5 = ei_pset1(lrhs[5]);
- A6 = ei_pset1(lrhs[6]);
- A7 = ei_pset1(lrhs[7]);
- }
-
- Scalar * lb = &block[l2BlockRows * k];
- for(int i=0; i<l2BlockRows; i+=2*PacketSize)
- {
- PacketType R0, R1, L0, L1, T0, T1;
-
- // We perform "cross products" of vectors to avoid
- // reductions (horizontal ops) afterwards
- T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]);
- T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]);
-
- R0 = ei_pload(&lb[0*PacketSize]);
- L0 = ei_pload(&lb[1*PacketSize]);
- R1 = ei_pload(&lb[2*PacketSize]);
- L1 = ei_pload(&lb[3*PacketSize]);
- T0 = cj.pmadd(A0, R0, T0);
- T1 = cj.pmadd(A0, L0, T1);
- R0 = ei_pload(&lb[4*PacketSize]);
- L0 = ei_pload(&lb[5*PacketSize]);
- T0 = cj.pmadd(A1, R1, T0);
- T1 = cj.pmadd(A1, L1, T1);
- R1 = ei_pload(&lb[6*PacketSize]);
- L1 = ei_pload(&lb[7*PacketSize]);
- T0 = cj.pmadd(A2, R0, T0);
- T1 = cj.pmadd(A2, L0, T1);
- if(MaxBlockRows==8)
- {
- R0 = ei_pload(&lb[8*PacketSize]);
- L0 = ei_pload(&lb[9*PacketSize]);
- }
- T0 = cj.pmadd(A3, R1, T0);
- T1 = cj.pmadd(A3, L1, T1);
- if(MaxBlockRows==8)
- {
- R1 = ei_pload(&lb[10*PacketSize]);
- L1 = ei_pload(&lb[11*PacketSize]);
- T0 = cj.pmadd(A4, R0, T0);
- T1 = cj.pmadd(A4, L0, T1);
- R0 = ei_pload(&lb[12*PacketSize]);
- L0 = ei_pload(&lb[13*PacketSize]);
- T0 = cj.pmadd(A5, R1, T0);
- T1 = cj.pmadd(A5, L1, T1);
- R1 = ei_pload(&lb[14*PacketSize]);
- L1 = ei_pload(&lb[15*PacketSize]);
- T0 = cj.pmadd(A6, R0, T0);
- T1 = cj.pmadd(A6, L0, T1);
- T0 = cj.pmadd(A7, R1, T0);
- T1 = cj.pmadd(A7, L1, T1);
- }
- lb += MaxBlockRows*2*PacketSize;
-
- ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0);
- ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1);
- }
+ ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
+ ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
}
+ count += nr*PacketSize;
+ }
+ }
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(int j2=packet_cols; j2<cols; ++j2)
+ {
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
+ if (hasAlpha)
+ {
+ for(int k=0; k<actual_kc; k++)
+ {
+ ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
+ count += PacketSize;
+ }
+ }
+ else
+ {
+ for(int k=0; k<actual_kc; k++)
+ {
+ ei_pstore(&blockB[count], ei_pset1(b0[k]));
+ count += PacketSize;
}
}
}
}
- delete[] block;
-#endif
-
-
-}
+};
#endif // EIGEN_EXTERN_INSTANTIATIONS
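The heart of ei_gebp_kernel is its register block: C0..C7 hold a (2*PacketSize) x nr tile of the result in registers while the packed lhs and rhs are streamed through, and the fourfold peeling of the k loop simply unrolls that update. Stripped of packets, prefetching and peeling, the idea reduces to the sketch below (a simplified model with illustrative names; in the real kernel blockB additionally stores each rhs coefficient pre-expanded to a full packet and pre-scaled by alpha):

    // One mr x nr tile of res accumulated in local variables; blA holds the
    // lhs block packed so that mr consecutive entries share the same k, and
    // blB holds the rhs panel packed so that nr coefficients share the same k.
    template<int mr, int nr>
    void gebp_tile(float* res, int resStride,
                   const float* blA, const float* blB, int kc)
    {
      float acc[mr][nr] = {};               // the register block (C0..C7 above)
      for (int k = 0; k < kc; ++k)
      {
        for (int n = 0; n < nr; ++n)
          for (int m = 0; m < mr; ++m)
            acc[m][n] += blA[m] * blB[n];   // cj.pmadd in the vectorized code
        blA += mr;
        blB += nr;
      }
      for (int n = 0; n < nr; ++n)          // write the tile back to res
        for (int m = 0; m < mr; ++m)
          res[n*resStride + m] += acc[m][n];
    }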
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 000000000..6ab7b7211
--- /dev/null
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,184 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+template<typename Scalar, int mr>
+struct ei_symm_pack_lhs
+{
+ void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
+ {
+ int count = 0;
+ const int peeled_mc = (actual_mc/mr)*mr;
+ if (lhsRowMajor)
+ {
+ for(int i=0; i<peeled_mc; i+=mr)
+ for(int k=0; k<actual_kc; k++)
+ for(int w=0; w<mr; w++)
+ blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
+ for(int i=peeled_mc; i<actual_mc; i++)
+ {
+ const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
+ for(int k=0; k<actual_kc; k++)
+ blockA[count++] = llhs[k];
+ }
+ }
+ else
+ {
+ for(int i=0; i<peeled_mc; i+=mr)
+ {
+ for(int k=0; k<i; k++)
+ for(int w=0; w<mr; w++)
+ blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
+
+ // symmetric copy
+ int h = 0;
+ for(int k=i; k<i+mr; k++)
+ {
+ // transposed copy
+ for(int w=0; w<h; w++)
+ blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
+ for(int w=h; w<mr; w++)
+ blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
+ ++h;
+ }
+
+ // transposed copy
+ for(int k=i+mr; k<actual_kc; k++)
+ for(int w=0; w<mr; w++)
+ blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
+ }
+ // do the same with mr==1
+ for(int i=peeled_mc; i<actual_mc; i++)
+ {
+ for(int k=0; k<=i; k++)
+ blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
+
+ // transposed copy
+ for(int k=i+1; k<actual_kc; k++)
+ blockA[count++] = lhs[(k2+k) + (i2+i)*lhsStride];
+ }
+ }
+ }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
+ int size,
+ const Scalar* lhs, int lhsStride,
+ const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols,
+ Scalar* res, int resStride,
+ Scalar alpha)
+{
+ typedef typename ei_packet_traits<Scalar>::type Packet;
+
+ ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
+ if (ConjugateRhs)
+ alpha = ei_conj(alpha);
+ bool hasAlpha = alpha != Scalar(1);
+
+ typedef typename ei_packet_traits<Scalar>::type PacketType;
+
+ const bool lhsRowMajor = (StorageOrder==RowMajor);
+
+ enum {
+ PacketSize = sizeof(PacketType)/sizeof(Scalar),
+ #if (defined __i386__)
+ HalfRegisterCount = 4,
+ #else
+ HalfRegisterCount = 8,
+ #endif
+
+ // register block size along the N direction
+ nr = HalfRegisterCount/2,
+
+ // register block size along the M direction
+ mr = 2 * PacketSize,
+
+ // max cache block size along the K direction
+ Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,
+
+ // max cache block size along the M direction
+ Max_mc = 2*Max_kc
+ };
+
+ int kc = std::min<int>(Max_kc,size); // cache block size along the K direction
+ int mc = std::min<int>(Max_mc,size); // cache block size along the M direction
+
+ Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+ Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
+
+ // number of columns which can be processed by packet of nr columns
+ int packet_cols = (cols/nr)*nr;
+
+ for(int k2=0; k2<size; k2+=kc)
+ {
+ const int actual_kc = std::min(k2+kc,size)-k2;
+
+ // we have selected one row panel of rhs and one column panel of lhs
+ // pack rhs's panel into a sequential chunk of memory
+ // and expand each coeff to a constant packet for further reuse
+ ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
+
+    // the selected lhs panel has to be split into three different parts:
+ // 1 - the transposed panel above the diagonal block => transposed packed copy
+ // 2 - the diagonal block => special packed copy
+ // 3 - the panel below the diagonal block => generic packed copy
+ for(int i2=0; i2<k2; i2+=mc)
+ {
+ const int actual_mc = std::min(i2+mc,k2)-i2;
+ // transposed packed copy
+ ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2);
+
+ ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
+ (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ }
+ // the block diagonal
+ {
+ const int actual_mc = std::min(k2+kc,size)-k2;
+ // symmetric packed copy
+ ei_symm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2);
+ ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
+ (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
+ }
+
+ for(int i2=k2+kc; i2<size; i2+=mc)
+ {
+ const int actual_mc = std::min(i2+mc,size)-i2;
+ ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);
+ ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
+ (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+ }
+ }
+
+ ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+ ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
+}
+
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
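Functionally, ei_product_selfadjoint_matrix computes res += alpha * A * B while referencing only one triangle of the selfadjoint A; the three-part split of the lhs panel is how that constraint maps onto the packed copies. A plain scalar reference for the real, column-major, LowerTriangular case might look as follows (a model for checking semantics, not the actual implementation):

    // Scalar reference for res += alpha * A * B where A is selfadjoint and
    // only its lower triangle is stored (column-major, real scalars).
    void symm_lower_ref(const float* A, int lda, const float* B, int ldb,
                        float* res, int ldr, int size, int cols, float alpha)
    {
      for (int j = 0; j < cols; ++j)
        for (int k = 0; k < size; ++k)
          for (int i = 0; i < size; ++i)
          {
            // A(i,k): on/below the diagonal -> stored; above -> mirror A(k,i)
            const float a = (i >= k) ? A[k*lda + i] : A[i*lda + k];
            res[j*ldr + i] += alpha * a * B[j*ldb + k];
          }
    }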
diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
index 29fbf11bf..814672696 100644
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@@ -46,9 +46,9 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
Scalar s1 = ei_random<Scalar>(),
s2 = ei_random<Scalar>(),
s3 = ei_random<Scalar>();
-
+
m1 = m1.adjoint()*m1;
-
+
// lower
m2.setZero();
m2.template triangularView<LowerTriangular>() = m1;
@@ -68,7 +68,7 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
m2 = m1.template triangularView<LowerTriangular>();
m2.template selfadjointView<LowerTriangular>().rank2update(v1,v2);
VERIFY_IS_APPROX(m2, (m1 + v1 * v2.adjoint()+ v2 * v1.adjoint()).template triangularView<LowerTriangular>().toDense());
-
+
m2 = m1.template triangularView<UpperTriangular>();
m2.template selfadjointView<UpperTriangular>().rank2update(-v1,s2*v2,s3);
VERIFY_IS_APPROX(m2, (m1 + (-s2*s3) * (v1 * v2.adjoint()+ v2 * v1.adjoint())).template triangularView<UpperTriangular>().toDense());
@@ -99,4 +99,24 @@ void test_product_selfadjoint()
CALL_SUBTEST( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(17,17)) );
CALL_SUBTEST( product_selfadjoint(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(19, 19)) );
}
+
+ for(int i = 0; i < g_repeat ; i++)
+ {
+ int size = ei_random<int>(10,1024);
+ int cols = ei_random<int>(10,320);
+ MatrixXf A = MatrixXf::Random(size,size);
+ MatrixXf B = MatrixXf::Random(size,cols);
+ MatrixXf C = MatrixXf::Random(size,cols);
+ MatrixXf R = MatrixXf::Random(size,cols);
+ A = (A+A.transpose()).eval();
+
+ R = C + (A * B).eval();
+
+ A.corner(TopRight,size-1,size-1).triangularView<UpperTriangular>().setZero();
+
+ ei_product_selfadjoint_matrix<float,ColMajor,LowerTriangular,false,false>
+ (size, A.data(), A.stride(), B.data(), B.stride(), false, B.cols(), C.data(), C.stride(), 1);
+// std::cerr << A << "\n\n" << C << "\n\n" << R << "\n\n";
+ VERIFY_IS_APPROX(C,R);
+ }
}