diff options
author | 2010-02-22 21:32:29 +0100 | |
---|---|---|
committer | 2010-02-22 21:32:29 +0100 | |
commit | 3beedba244f2fc05bb5e301a688ee6bc01f73698 (patch) | |
tree | ec783020d3a80e448a2c90a341f35677378fbf5a | |
parent | 437f40acc1cbd9ce2f2a2a3f413cae3a5b35f8fb (diff) | |
parent | d3b314569b8040c813b914381cc7d0a2e4a1a529 (diff) |
merge
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 418 | ||||
-rw-r--r-- | bench/BenchTimer.h | 94 | ||||
-rwxr-xr-x | bench/bench_unrolling | 3 | ||||
-rwxr-xr-x | bench/benchmark_suite | 1 | ||||
-rw-r--r-- | unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 10 | ||||
-rw-r--r-- | unsupported/test/matrix_function.cpp | 12 |
6 files changed, 365 insertions, 173 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fe1987bdd..18e913b0e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -27,6 +27,12 @@ #ifndef EIGEN_EXTERN_INSTANTIATIONS +#ifdef EIGEN_HAS_FUSE_CJMADD +#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); +#else +#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); +#endif + // optimized GEneral packed Block * packed Panel product kernel template<typename Scalar, int mr, int nr, typename Conj> struct ei_gebp_kernel @@ -74,65 +80,111 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k<peeled_kc; k+=4) { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[3*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[4*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[5*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[6*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[7*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); + if(nr==2) + { + PacketType B0, T0, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T0); + B0 = ei_pload(&blB[1*PacketSize]); + CJMADD(A0,B0,C1,T0); + CJMADD(A1,B0,C5,T0); + + A0 = ei_pload(&blA[2*PacketSize]); + A1 = ei_pload(&blA[3*PacketSize]); + B0 = ei_pload(&blB[2*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T0); + B0 = ei_pload(&blB[3*PacketSize]); + CJMADD(A0,B0,C1,T0); + CJMADD(A1,B0,C5,T0); + + A0 = ei_pload(&blA[4*PacketSize]); + A1 = ei_pload(&blA[5*PacketSize]); + B0 = ei_pload(&blB[4*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T0); + B0 = ei_pload(&blB[5*PacketSize]); + CJMADD(A0,B0,C1,T0); + CJMADD(A1,B0,C5,T0); + + A0 = ei_pload(&blA[6*PacketSize]); + A1 = ei_pload(&blA[7*PacketSize]); + B0 = ei_pload(&blB[6*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T0); + B0 = ei_pload(&blB[7*PacketSize]); + CJMADD(A0,B0,C1,T0); + CJMADD(A1,B0,C5,T0); + } + else + { + + PacketType B0, B1, B2, B3, A0, A1; + PacketType T0, T1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + + CJMADD(A0,B0,C0,T0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + CJMADD(A1,B0,C4,T1); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + CJMADD(A0,B1,C1,T0); + CJMADD(A1,B1,C5,T1); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A1,B2,C6,T1); } + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T0); } + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) { CJMADD(A1,B3,C7,T1); } + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T1); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + CJMADD(A0,B1,C1,T0); + CJMADD(A1,B1,C5,T1); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A1,B2,C6,T1); } + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T0); } + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) { CJMADD(A1,B3,C7,T1); } + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T1); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + CJMADD(A0,B1,C1,T0); + CJMADD(A1,B1,C5,T1); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A1,B2,C6,T1); } + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T0); } + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) { CJMADD(A1,B3,C7,T1); } + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T1); + CJMADD(A0,B1,C1,T0); + CJMADD(A1,B1,C5,T1); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A1,B2,C6,T1); } + if(nr==4) { CJMADD(A0,B3,C3,T0); } + if(nr==4) { CJMADD(A1,B3,C7,T1); } + } blB += 4*nr*PacketSize; blA += 4*mr; @@ -140,22 +192,40 @@ struct ei_gebp_kernel // process remaining peeled loop for(int k=peeled_kc; k<depth; k++) { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); + if(nr==2) + { + PacketType B0, T0, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T0); + B0 = ei_pload(&blB[1*PacketSize]); + CJMADD(A0,B0,C1,T0); + CJMADD(A1,B0,C5,T0); + } + else + { + PacketType B0, B1, B2, B3, A0, A1, T0, T1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + + CJMADD(A0,B0,C0,T0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + CJMADD(A1,B0,C4,T1); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + CJMADD(A0,B1,C1,T0); + CJMADD(A1,B1,C5,T1); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A1,B2,C6,T1); } + if(nr==4) { CJMADD(A0,B3,C3,T0); } + if(nr==4) { CJMADD(A1,B3,C7,T1); } + } blB += nr*PacketSize; blA += mr; @@ -189,45 +259,79 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k<peeled_kc; k+=4) { - PacketType B0, B1, B2, B3, A0; - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[1*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - - C0 = cj.pmadd(A0, B0, C0); - B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[3*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==2) + { + PacketType B0, T0, A0; + + A0 = ei_pload(&blA[0*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[1*PacketSize]); + CJMADD(A0,B0,C1,T0); + + A0 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[2*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[3*PacketSize]); + CJMADD(A0,B0,C1,T0); + + A0 = ei_pload(&blA[2*PacketSize]); + B0 = ei_pload(&blB[4*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[5*PacketSize]); + CJMADD(A0,B0,C1,T0); + + A0 = ei_pload(&blA[3*PacketSize]); + B0 = ei_pload(&blB[6*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[7*PacketSize]); + CJMADD(A0,B0,C1,T0); + } + else + { + + PacketType B0, B1, B2, B3, A0; + PacketType T0, T1; + + A0 = ei_pload(&blA[0*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + + CJMADD(A0,B0,C0,T0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + CJMADD(A0,B1,C1,T1); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T1); } + A0 = ei_pload(&blA[1*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + CJMADD(A0,B1,C1,T1); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T1); } + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + CJMADD(A0,B1,C1,T1); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) { CJMADD(A0,B3,C3,T1); } + A0 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + CJMADD(A0,B0,C0,T0); + CJMADD(A0,B1,C1,T1); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A0,B3,C3,T1); } + } blB += 4*nr*PacketSize; blA += 4*PacketSize; @@ -235,17 +339,32 @@ struct ei_gebp_kernel // process remaining peeled loop for(int k=peeled_kc; k<depth; k++) { - PacketType B0, B1, B2, B3, A0; - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==2) + { + PacketType B0, T0, A0; + + A0 = ei_pload(&blA[0*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + CJMADD(A0,B0,C0,T0); + B0 = ei_pload(&blB[1*PacketSize]); + CJMADD(A0,B0,C1,T0); + } + else + { + PacketType B0, B1, B2, B3, A0; + PacketType T0, T1; + + A0 = ei_pload(&blA[0*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + + CJMADD(A0,B0,C0,T0); + CJMADD(A0,B1,C1,T1); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A0,B3,C3,T1); } + } blB += nr*PacketSize; blA += PacketSize; @@ -268,17 +387,32 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k<depth; k++) { - Scalar B0, B1, B2, B3, A0; - - A0 = blA[k]; - B0 = blB[0*PacketSize]; - B1 = blB[1*PacketSize]; - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = blB[2*PacketSize]; - if(nr==4) B3 = blB[3*PacketSize]; - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==2) + { + Scalar B0, T0, A0; + + A0 = blA[0*PacketSize]; + B0 = blB[0*PacketSize]; + CJMADD(A0,B0,C0,T0); + B0 = blB[1*PacketSize]; + CJMADD(A0,B0,C1,T0); + } + else + { + Scalar B0, B1, B2, B3, A0; + Scalar T0, T1; + + A0 = blA[k]; + B0 = blB[0*PacketSize]; + B1 = blB[1*PacketSize]; + if(nr==4) B2 = blB[2*PacketSize]; + if(nr==4) B3 = blB[3*PacketSize]; + + CJMADD(A0,B0,C0,T0); + CJMADD(A0,B1,C1,T1); + if(nr==4) { CJMADD(A0,B2,C2,T0); } + if(nr==4) { CJMADD(A0,B3,C3,T1); } + } blB += nr*PacketSize; } @@ -310,13 +444,13 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; for(int k=0; k<depth; k++) { - PacketType B0, A0, A1; + PacketType B0, A0, A1, T0, T1; A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); + CJMADD(A0,B0,C0,T0); + CJMADD(A1,B0,C4,T1); blB += PacketSize; blA += mr; @@ -334,7 +468,7 @@ struct ei_gebp_kernel #endif PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; for(int k=0; k<depth; k++) { @@ -363,6 +497,8 @@ struct ei_gebp_kernel } }; +#undef CJMADD + // pack a block of the lhs // The travesal is as follow (mr==4): // 0 4 8 12 ... @@ -474,7 +610,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode> // skip what we have after if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } - + // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2<cols; ++j2) { diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index b2d0fc5f6..e49afa07f 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -23,24 +23,31 @@ // License and a copy of the GNU General Public License along with // Eigen. If not, see <http://www.gnu.org/licenses/>. -#ifndef EIGEN_BENCH_TIMER_H -#define EIGEN_BENCH_TIMER_H +#ifndef EIGEN_BENCH_TIMERR_H +#define EIGEN_BENCH_TIMERR_H #if defined(_WIN32) || defined(__CYGWIN__) #define NOMINMAX #define WIN32_LEAN_AND_MEAN #include <windows.h> #else +#include <sys/time.h> #include <time.h> #include <unistd.h> #endif +#include <cmath> #include <cstdlib> #include <numeric> namespace Eigen { +enum { + CPU_TIMER = 0, + REAL_TIMER = 1 +}; + /** Elapsed time timer keeping the best try. * * On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID. @@ -52,37 +59,58 @@ class BenchTimer { public: - BenchTimer() - { + BenchTimer() + { #if defined(_WIN32) || defined(__CYGWIN__) LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); m_frequency = (double)freq.QuadPart; #endif - reset(); + reset(); } ~BenchTimer() {} - inline void reset(void) {m_best = 1e6;} - inline void start(void) {m_start = getTime();} - inline void stop(void) + inline void reset() + { + m_bests.fill(1e9); + m_totals.setZero(); + } + inline void start() + { + m_starts[CPU_TIMER] = getCpuTime(); + m_starts[REAL_TIMER] = getRealTime(); + } + inline void stop() { - m_best = std::min(m_best, getTime() - m_start); + m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER]; + m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER]; + m_bests = m_bests.cwiseMin(m_times); + m_totals += m_times; } - /** Return the best elapsed time in seconds. + /** Return the elapsed time in seconds between the last start/stop pair */ - inline double value(void) + inline double value(int TIMER = CPU_TIMER) { - return m_best; + return m_times[TIMER]; } -#if defined(_WIN32) || defined(__CYGWIN__) - inline double getTime(void) -#else - static inline double getTime(void) -#endif + /** Return the best elapsed time in seconds + */ + inline double best(int TIMER = CPU_TIMER) + { + return m_bests[TIMER]; + } + + /** Return the total elapsed time in seconds. + */ + inline double total(int TIMER = CPU_TIMER) + { + return m_totals[TIMER]; + } + + inline double getCpuTime() { #ifdef WIN32 LARGE_INTEGER query_ticks; @@ -95,14 +123,42 @@ public: #endif } + inline double getRealTime() + { +#ifdef WIN32 + SYSTEMTIME st; + GetSystemTime(&st); + return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; +#else + struct timeval tv; + struct timezone tz; + gettimeofday(&tv, &tz); + return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec; +#endif + } + protected: #if defined(_WIN32) || defined(__CYGWIN__) double m_frequency; #endif - double m_best, m_start; + Vector2d m_starts; + Vector2d m_times; + Vector2d m_bests; + Vector2d m_totals; }; +#define BENCH(TIMER,TRIES,REP,CODE) { \ + TIMER.reset(); \ + for(int uglyvarname1=0; uglyvarname1<TRIES; ++uglyvarname1){ \ + TIMER.start(); \ + for(int uglyvarname2=0; uglyvarname2<REP; ++uglyvarname2){ \ + CODE; \ + } \ + TIMER.stop(); \ + } \ + } + } -#endif // EIGEN_BENCH_TIMER_H +#endif // EIGEN_BENCH_TIMERR_H diff --git a/bench/bench_unrolling b/bench/bench_unrolling index bf01cce7d..826443845 100755 --- a/bench/bench_unrolling +++ b/bench/bench_unrolling @@ -2,10 +2,11 @@ # gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000" # icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions" +CXX=${CXX-g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000} # default value for ((i=1; i<16; ++i)); do echo "Matrix size: $i x $i :" - $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=1024 -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null + $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null $CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null echo " " done diff --git a/bench/benchmark_suite b/bench/benchmark_suite index a8fc6dced..3f21d3661 100755 --- a/bench/benchmark_suite +++ b/bench/benchmark_suite @@ -1,4 +1,5 @@ #!/bin/bash +CXX=${CXX-g++} # default value unless caller has defined CXX echo "Fixed size 3x3, column-major, -DNDEBUG" $CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null echo "Fixed size 3x3, column-major, with asserts" diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 12322a256..e8069154c 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -178,9 +178,9 @@ class MatrixFunction<MatrixType, 1> * * This is morally a \c static \c const \c Scalar, but only * integers can be static constant class members in C++. The - * separation constant is set to 0.01, a value taken from the + * separation constant is set to 0.1, a value taken from the * paper by Davies and Higham. */ - static const RealScalar separation() { return static_cast<RealScalar>(0.01); } + static const RealScalar separation() { return static_cast<RealScalar>(0.1); } }; /** \brief Constructor. @@ -492,14 +492,12 @@ typename MatrixFunction<MatrixType,1>::DynMatrixType MatrixFunction<MatrixType,1 template<typename Derived> class MatrixFunctionReturnValue : public ReturnByValue<MatrixFunctionReturnValue<Derived> > { - private: + public: typedef typename ei_traits<Derived>::Scalar Scalar; typedef typename ei_stem_function<Scalar>::type StemFunction; - public: - - /** \brief Constructor. + /** \brief Constructor. * * \param[in] A %Matrix (expression) forming the argument of the * matrix function. diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 4ff6d7f1e..7a1501da2 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -109,11 +109,10 @@ template<typename MatrixType> void testHyperbolicFunctions(const MatrixType& A) { for (int i = 0; i < g_repeat; i++) { - MatrixType sinhA = ei_matrix_sinh(A); - MatrixType coshA = ei_matrix_cosh(A); MatrixType expA = ei_matrix_exponential(A); - VERIFY_IS_APPROX(sinhA, (expA - expA.inverse())/2); - VERIFY_IS_APPROX(coshA, (expA + expA.inverse())/2); + MatrixType expmA = ei_matrix_exponential(-A); + VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2); + VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2); } } @@ -134,14 +133,15 @@ void testGonioFunctions(const MatrixType& A) ComplexMatrix Ac = A.template cast<ComplexScalar>(); ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac); + ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac); MatrixType sinA = ei_matrix_sin(A); ComplexMatrix sinAc = sinA.template cast<ComplexScalar>(); - VERIFY_IS_APPROX(sinAc, (exp_iA - exp_iA.inverse()) / (two*imagUnit)); + VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit)); MatrixType cosA = ei_matrix_cos(A); ComplexMatrix cosAc = cosA.template cast<ComplexScalar>(); - VERIFY_IS_APPROX(cosAc, (exp_iA + exp_iA.inverse()) / 2); + VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2); } } |