aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2010-02-22 21:32:29 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2010-02-22 21:32:29 +0100
commit3beedba244f2fc05bb5e301a688ee6bc01f73698 (patch)
treeec783020d3a80e448a2c90a341f35677378fbf5a
parent437f40acc1cbd9ce2f2a2a3f413cae3a5b35f8fb (diff)
parentd3b314569b8040c813b914381cc7d0a2e4a1a529 (diff)
merge
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h418
-rw-r--r--bench/BenchTimer.h94
-rwxr-xr-xbench/bench_unrolling3
-rwxr-xr-xbench/benchmark_suite1
-rw-r--r--unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h10
-rw-r--r--unsupported/test/matrix_function.cpp12
6 files changed, 365 insertions, 173 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index fe1987bdd..18e913b0e 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -27,6 +27,12 @@
#ifndef EIGEN_EXTERN_INSTANTIATIONS
+#ifdef EIGEN_HAS_FUSE_CJMADD
+#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
+#else
+#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
+#endif
+
// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel
@@ -74,65 +80,111 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<peeled_kc; k+=4)
{
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[2*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[3*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[4*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[5*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[6*PacketSize]);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
- A1 = ei_pload(&blA[7*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ if(nr==2)
+ {
+ PacketType B0, T0, A0, A1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T0);
+ B0 = ei_pload(&blB[1*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ CJMADD(A1,B0,C5,T0);
+
+ A0 = ei_pload(&blA[2*PacketSize]);
+ A1 = ei_pload(&blA[3*PacketSize]);
+ B0 = ei_pload(&blB[2*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T0);
+ B0 = ei_pload(&blB[3*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ CJMADD(A1,B0,C5,T0);
+
+ A0 = ei_pload(&blA[4*PacketSize]);
+ A1 = ei_pload(&blA[5*PacketSize]);
+ B0 = ei_pload(&blB[4*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T0);
+ B0 = ei_pload(&blB[5*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ CJMADD(A1,B0,C5,T0);
+
+ A0 = ei_pload(&blA[6*PacketSize]);
+ A1 = ei_pload(&blA[7*PacketSize]);
+ B0 = ei_pload(&blB[6*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T0);
+ B0 = ei_pload(&blB[7*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ CJMADD(A1,B0,C5,T0);
+ }
+ else
+ {
+
+ PacketType B0, B1, B2, B3, A0, A1;
+ PacketType T0, T1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ CJMADD(A1,B0,C4,T1);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+ CJMADD(A0,B1,C1,T0);
+ CJMADD(A1,B1,C5,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A1,B2,C6,T1); }
+ if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T0); }
+ A0 = ei_pload(&blA[2*PacketSize]);
+ if(nr==4) { CJMADD(A1,B3,C7,T1); }
+ A1 = ei_pload(&blA[3*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T1);
+ B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
+ CJMADD(A0,B1,C1,T0);
+ CJMADD(A1,B1,C5,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A1,B2,C6,T1); }
+ if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T0); }
+ A0 = ei_pload(&blA[4*PacketSize]);
+ if(nr==4) { CJMADD(A1,B3,C7,T1); }
+ A1 = ei_pload(&blA[5*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T1);
+ B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
+ CJMADD(A0,B1,C1,T0);
+ CJMADD(A1,B1,C5,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A1,B2,C6,T1); }
+ if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T0); }
+ A0 = ei_pload(&blA[6*PacketSize]);
+ if(nr==4) { CJMADD(A1,B3,C7,T1); }
+ A1 = ei_pload(&blA[7*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T1);
+ CJMADD(A0,B1,C1,T0);
+ CJMADD(A1,B1,C5,T1);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A1,B2,C6,T1); }
+ if(nr==4) { CJMADD(A0,B3,C3,T0); }
+ if(nr==4) { CJMADD(A1,B3,C7,T1); }
+ }
blB += 4*nr*PacketSize;
blA += 4*mr;
@@ -140,22 +192,40 @@ struct ei_gebp_kernel
// process remaining peeled loop
for(int k=peeled_kc; k<depth; k++)
{
- PacketType B0, B1, B2, B3, A0, A1;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- A1 = ei_pload(&blA[1*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- C4 = cj.pmadd(A1, B0, C4);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- C5 = cj.pmadd(A1, B1, C5);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C6 = cj.pmadd(A1, B2, C6);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ if(nr==2)
+ {
+ PacketType B0, T0, A0, A1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T0);
+ B0 = ei_pload(&blB[1*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ CJMADD(A1,B0,C5,T0);
+ }
+ else
+ {
+ PacketType B0, B1, B2, B3, A0, A1, T0, T1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ CJMADD(A1,B0,C4,T1);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+ CJMADD(A0,B1,C1,T0);
+ CJMADD(A1,B1,C5,T1);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A1,B2,C6,T1); }
+ if(nr==4) { CJMADD(A0,B3,C3,T0); }
+ if(nr==4) { CJMADD(A1,B3,C7,T1); }
+ }
blB += nr*PacketSize;
blA += mr;
@@ -189,45 +259,79 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<peeled_kc; k+=4)
{
- PacketType B0, B1, B2, B3, A0;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[1*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[2*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
- C0 = cj.pmadd(A0, B0, C0);
- B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
- A0 = ei_pload(&blA[3*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C1 = cj.pmadd(A0, B1, C1);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ if(nr==2)
+ {
+ PacketType B0, T0, A0;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[1*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+
+ A0 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[2*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[3*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+
+ A0 = ei_pload(&blA[2*PacketSize]);
+ B0 = ei_pload(&blB[4*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[5*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+
+ A0 = ei_pload(&blA[3*PacketSize]);
+ B0 = ei_pload(&blB[6*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[7*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ }
+ else
+ {
+
+ PacketType B0, B1, B2, B3, A0;
+ PacketType T0, T1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+ CJMADD(A0,B1,C1,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ A0 = ei_pload(&blA[1*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
+ CJMADD(A0,B1,C1,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ A0 = ei_pload(&blA[2*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
+ CJMADD(A0,B1,C1,T1);
+ B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ A0 = ei_pload(&blA[3*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A0,B1,C1,T1);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ }
blB += 4*nr*PacketSize;
blA += 4*PacketSize;
@@ -235,17 +339,32 @@ struct ei_gebp_kernel
// process remaining peeled loop
for(int k=peeled_kc; k<depth; k++)
{
- PacketType B0, B1, B2, B3, A0;
-
- A0 = ei_pload(&blA[0*PacketSize]);
- B0 = ei_pload(&blB[0*PacketSize]);
- B1 = ei_pload(&blB[1*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
- if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
- C1 = cj.pmadd(A0, B1, C1);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ if(nr==2)
+ {
+ PacketType B0, T0, A0;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ CJMADD(A0,B0,C0,T0);
+ B0 = ei_pload(&blB[1*PacketSize]);
+ CJMADD(A0,B0,C1,T0);
+ }
+ else
+ {
+ PacketType B0, B1, B2, B3, A0;
+ PacketType T0, T1;
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A0,B1,C1,T1);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ }
blB += nr*PacketSize;
blA += PacketSize;
@@ -268,17 +387,32 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<depth; k++)
{
- Scalar B0, B1, B2, B3, A0;
-
- A0 = blA[k];
- B0 = blB[0*PacketSize];
- B1 = blB[1*PacketSize];
- C0 = cj.pmadd(A0, B0, C0);
- if(nr==4) B2 = blB[2*PacketSize];
- if(nr==4) B3 = blB[3*PacketSize];
- C1 = cj.pmadd(A0, B1, C1);
- if(nr==4) C2 = cj.pmadd(A0, B2, C2);
- if(nr==4) C3 = cj.pmadd(A0, B3, C3);
+ if(nr==2)
+ {
+ Scalar B0, T0, A0;
+
+ A0 = blA[0*PacketSize];
+ B0 = blB[0*PacketSize];
+ CJMADD(A0,B0,C0,T0);
+ B0 = blB[1*PacketSize];
+ CJMADD(A0,B0,C1,T0);
+ }
+ else
+ {
+ Scalar B0, B1, B2, B3, A0;
+ Scalar T0, T1;
+
+ A0 = blA[k];
+ B0 = blB[0*PacketSize];
+ B1 = blB[1*PacketSize];
+ if(nr==4) B2 = blB[2*PacketSize];
+ if(nr==4) B3 = blB[3*PacketSize];
+
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A0,B1,C1,T1);
+ if(nr==4) { CJMADD(A0,B2,C2,T0); }
+ if(nr==4) { CJMADD(A0,B3,C3,T1); }
+ }
blB += nr*PacketSize;
}
@@ -310,13 +444,13 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
for(int k=0; k<depth; k++)
{
- PacketType B0, A0, A1;
+ PacketType B0, A0, A1, T0, T1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
- C0 = cj.pmadd(A0, B0, C0);
- C4 = cj.pmadd(A1, B0, C4);
+ CJMADD(A0,B0,C0,T0);
+ CJMADD(A1,B0,C4,T1);
blB += PacketSize;
blA += mr;
@@ -334,7 +468,7 @@ struct ei_gebp_kernel
#endif
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
-
+
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
for(int k=0; k<depth; k++)
{
@@ -363,6 +497,8 @@ struct ei_gebp_kernel
}
};
+#undef CJMADD
+
// pack a block of the lhs
// The travesal is as follow (mr==4):
// 0 4 8 12 ...
@@ -474,7 +610,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
}
-
+
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h
index b2d0fc5f6..e49afa07f 100644
--- a/bench/BenchTimer.h
+++ b/bench/BenchTimer.h
@@ -23,24 +23,31 @@
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
-#ifndef EIGEN_BENCH_TIMER_H
-#define EIGEN_BENCH_TIMER_H
+#ifndef EIGEN_BENCH_TIMERR_H
+#define EIGEN_BENCH_TIMERR_H
#if defined(_WIN32) || defined(__CYGWIN__)
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
+#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#endif
+#include <cmath>
#include <cstdlib>
#include <numeric>
namespace Eigen
{
+enum {
+ CPU_TIMER = 0,
+ REAL_TIMER = 1
+};
+
/** Elapsed time timer keeping the best try.
*
* On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID.
@@ -52,37 +59,58 @@ class BenchTimer
{
public:
- BenchTimer()
- {
+ BenchTimer()
+ {
#if defined(_WIN32) || defined(__CYGWIN__)
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
m_frequency = (double)freq.QuadPart;
#endif
- reset();
+ reset();
}
~BenchTimer() {}
- inline void reset(void) {m_best = 1e6;}
- inline void start(void) {m_start = getTime();}
- inline void stop(void)
+ inline void reset()
+ {
+ m_bests.fill(1e9);
+ m_totals.setZero();
+ }
+ inline void start()
+ {
+ m_starts[CPU_TIMER] = getCpuTime();
+ m_starts[REAL_TIMER] = getRealTime();
+ }
+ inline void stop()
{
- m_best = std::min(m_best, getTime() - m_start);
+ m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER];
+ m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER];
+ m_bests = m_bests.cwiseMin(m_times);
+ m_totals += m_times;
}
- /** Return the best elapsed time in seconds.
+ /** Return the elapsed time in seconds between the last start/stop pair
*/
- inline double value(void)
+ inline double value(int TIMER = CPU_TIMER)
{
- return m_best;
+ return m_times[TIMER];
}
-#if defined(_WIN32) || defined(__CYGWIN__)
- inline double getTime(void)
-#else
- static inline double getTime(void)
-#endif
+ /** Return the best elapsed time in seconds
+ */
+ inline double best(int TIMER = CPU_TIMER)
+ {
+ return m_bests[TIMER];
+ }
+
+ /** Return the total elapsed time in seconds.
+ */
+ inline double total(int TIMER = CPU_TIMER)
+ {
+ return m_totals[TIMER];
+ }
+
+ inline double getCpuTime()
{
#ifdef WIN32
LARGE_INTEGER query_ticks;
@@ -95,14 +123,42 @@ public:
#endif
}
+ inline double getRealTime()
+ {
+#ifdef WIN32
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds;
+#else
+ struct timeval tv;
+ struct timezone tz;
+ gettimeofday(&tv, &tz);
+ return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
+#endif
+ }
+
protected:
#if defined(_WIN32) || defined(__CYGWIN__)
double m_frequency;
#endif
- double m_best, m_start;
+ Vector2d m_starts;
+ Vector2d m_times;
+ Vector2d m_bests;
+ Vector2d m_totals;
};
+#define BENCH(TIMER,TRIES,REP,CODE) { \
+ TIMER.reset(); \
+ for(int uglyvarname1=0; uglyvarname1<TRIES; ++uglyvarname1){ \
+ TIMER.start(); \
+ for(int uglyvarname2=0; uglyvarname2<REP; ++uglyvarname2){ \
+ CODE; \
+ } \
+ TIMER.stop(); \
+ } \
+ }
+
}
-#endif // EIGEN_BENCH_TIMER_H
+#endif // EIGEN_BENCH_TIMERR_H
diff --git a/bench/bench_unrolling b/bench/bench_unrolling
index bf01cce7d..826443845 100755
--- a/bench/bench_unrolling
+++ b/bench/bench_unrolling
@@ -2,10 +2,11 @@
# gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000"
# icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions"
+CXX=${CXX-g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000} # default value
for ((i=1; i<16; ++i)); do
echo "Matrix size: $i x $i :"
- $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=1024 -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null
+ $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null
$CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null
echo " "
done
diff --git a/bench/benchmark_suite b/bench/benchmark_suite
index a8fc6dced..3f21d3661 100755
--- a/bench/benchmark_suite
+++ b/bench/benchmark_suite
@@ -1,4 +1,5 @@
#!/bin/bash
+CXX=${CXX-g++} # default value unless caller has defined CXX
echo "Fixed size 3x3, column-major, -DNDEBUG"
$CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null
echo "Fixed size 3x3, column-major, with asserts"
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 12322a256..e8069154c 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -178,9 +178,9 @@ class MatrixFunction<MatrixType, 1>
*
* This is morally a \c static \c const \c Scalar, but only
* integers can be static constant class members in C++. The
- * separation constant is set to 0.01, a value taken from the
+ * separation constant is set to 0.1, a value taken from the
* paper by Davies and Higham. */
- static const RealScalar separation() { return static_cast<RealScalar>(0.01); }
+ static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
};
/** \brief Constructor.
@@ -492,14 +492,12 @@ typename MatrixFunction<MatrixType,1>::DynMatrixType MatrixFunction<MatrixType,1
template<typename Derived> class MatrixFunctionReturnValue
: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
{
- private:
+ public:
typedef typename ei_traits<Derived>::Scalar Scalar;
typedef typename ei_stem_function<Scalar>::type StemFunction;
- public:
-
- /** \brief Constructor.
+ /** \brief Constructor.
*
* \param[in] A %Matrix (expression) forming the argument of the
* matrix function.
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 4ff6d7f1e..7a1501da2 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -109,11 +109,10 @@ template<typename MatrixType>
void testHyperbolicFunctions(const MatrixType& A)
{
for (int i = 0; i < g_repeat; i++) {
- MatrixType sinhA = ei_matrix_sinh(A);
- MatrixType coshA = ei_matrix_cosh(A);
MatrixType expA = ei_matrix_exponential(A);
- VERIFY_IS_APPROX(sinhA, (expA - expA.inverse())/2);
- VERIFY_IS_APPROX(coshA, (expA + expA.inverse())/2);
+ MatrixType expmA = ei_matrix_exponential(-A);
+ VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2);
+ VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2);
}
}
@@ -134,14 +133,15 @@ void testGonioFunctions(const MatrixType& A)
ComplexMatrix Ac = A.template cast<ComplexScalar>();
ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac);
+ ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac);
MatrixType sinA = ei_matrix_sin(A);
ComplexMatrix sinAc = sinA.template cast<ComplexScalar>();
- VERIFY_IS_APPROX(sinAc, (exp_iA - exp_iA.inverse()) / (two*imagUnit));
+ VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit));
MatrixType cosA = ei_matrix_cos(A);
ComplexMatrix cosAc = cosA.template cast<ComplexScalar>();
- VERIFY_IS_APPROX(cosAc, (exp_iA + exp_iA.inverse()) / 2);
+ VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2);
}
}