about summary refs log tree commit diff homepage
path: root/Eigen/src/Core/products
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- Eigen/src/Core/products/GeneralBlockPanelKernel.h 86
1 files changed, 68 insertions, 18 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c8a1dcced..6a16aa661 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -771,7 +771,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const Index peeled_kc = depth & ~(pk-1);
const Index prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
-
+
+#if EIGEN_ARCH_ARM
+ const bool PreferRotatingKernel = true;
+#else
+ const bool PreferRotatingKernel = false;
+#endif
+
+ const bool UseRotatingKernel =
+ PreferRotatingKernel &&
+ Traits::LhsPacketSize == 4 &&
+ Traits::RhsPacketSize == 4 &&
+ Traits::ResPacketSize == 4;
+
//---------- Process 3 * LhsProgress rows at once ----------
// This corresponds to 3*LhsProgress x nr register blocks.
// Usually, make sense only with FMA
@@ -818,7 +830,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
RhsPacket B_0, T0;
LhsPacket A2;
-#define EIGEN_GEBGP_ONESTEP(K) \
+#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
+ do { \
+ if (UseRotatingKernel) { \
+ if (N == 0) { \
+ B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
+ } else { \
+ EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
+ B_0 = protate<1>(B_0); \
+ } \
+ } else { \
+ traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
+ } \
+ } while (false)
+
+#define EIGEN_GEBP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
@@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, T0); \
traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
traits.madd(A0, B_0, C1, T0); \
traits.madd(A1, B_0, C5, T0); \
traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
traits.madd(A0, B_0, C2, T0); \
traits.madd(A1, B_0, C6, T0); \
traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
traits.madd(A0, B_0, C3 , T0); \
traits.madd(A1, B_0, C7, T0); \
traits.madd(A2, B_0, C11, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
} while(false)
-
+
internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
- EIGEN_GEBGP_ONESTEP(0);
- EIGEN_GEBGP_ONESTEP(1);
- EIGEN_GEBGP_ONESTEP(2);
- EIGEN_GEBGP_ONESTEP(3);
- EIGEN_GEBGP_ONESTEP(4);
- EIGEN_GEBGP_ONESTEP(5);
- EIGEN_GEBGP_ONESTEP(6);
- EIGEN_GEBGP_ONESTEP(7);
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
blB += pk*4*RhsProgress;
blA += pk*3*Traits::LhsProgress;
@@ -866,12 +892,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
RhsPacket B_0, T0;
LhsPacket A2;
- EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 3*Traits::LhsProgress;
}
- #undef EIGEN_GEBGP_ONESTEP
-
+#undef EIGEN_GEBP_ONESTEP
+
+ if (UseRotatingKernel) {
+ #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
+ do { \
+ PacketBlock<ResPacket> resblock; \
+ resblock.packet[0] = res0; \
+ resblock.packet[1] = res1; \
+ resblock.packet[2] = res2; \
+ resblock.packet[3] = res3; \
+ ptranspose(resblock); \
+ resblock.packet[3] = protate<1>(resblock.packet[3]); \
+ resblock.packet[2] = protate<2>(resblock.packet[2]); \
+ resblock.packet[1] = protate<3>(resblock.packet[1]); \
+ ptranspose(resblock); \
+ res0 = resblock.packet[0]; \
+ res1 = resblock.packet[1]; \
+ res2 = resblock.packet[2]; \
+ res3 = resblock.packet[3]; \
+ } while (false)
+
+ EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
+ EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
+ EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
+ }
+
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);