about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen')
-rw-r--r-- Eigen/src/Core/GenericPacketMath.h 15
-rw-r--r-- Eigen/src/Core/arch/NEON/PacketMath.h 25
-rwxr-xr-x Eigen/src/Core/arch/SSE/PacketMath.h 23
-rw-r--r-- Eigen/src/Core/products/GeneralBlockPanelKernel.h 86
-rw-r--r-- Eigen/src/Core/util/StaticAssert.h 3
5 files changed, 133 insertions, 19 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 74e1174ae..967a07df5 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -287,6 +287,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
{ return a; }
+template<size_t offset, typename Packet>
+struct protate_impl // Primary template that protate() dispatches to; this generic version performs no rotation.
+{
+ static Packet run(const Packet& a) { return a; } // Returns the input packet unchanged, whatever `offset` is.
+};
+
+/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
+ * by the given offset, e.g. for offset == 1:
+ * (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
+ *
+ * \tparam offset compile-time rotation amount; it must be strictly smaller than
+ * unpacket_traits<Packet>::size, otherwise the static assertion ROTATION_BY_ILLEGAL_OFFSET fires.
+ * \param a the packet to rotate.
+ *
+ * An offset of 0 returns \a a as is; any other offset is forwarded to protate_impl<offset, Packet>::run.
+ */
+template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
+{
+ EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
+ return offset ? protate_impl<offset, Packet>::run(a) : a;
+}
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 8149aed7f..e9af45f22 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
a_hi = vget_high_s32(a_r64);
return vcombine_s32(a_hi, a_lo);
}
+
+template<size_t offset>
+struct protate_impl<offset, Packet4f> // NEON specialization of protate_impl for Packet4f.
+{
+ static Packet4f run(const Packet4f& a) {
+ return vextq_f32(a, a, offset); // Passes `a` as both vector operands and `offset` as the index argument.
+ }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i> // NEON specialization of protate_impl for Packet4i.
+{
+ static Packet4i run(const Packet4i& a) {
+ return vextq_s32(a, a, offset); // Same form as the Packet4f case, using the s32 variant of the intrinsic.
+ }
+};
+
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
@@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template<size_t offset>
+struct protate_impl<offset, Packet2d> // NEON specialization of protate_impl for Packet2d.
+{
+ static Packet2d run(const Packet2d& a) {
+ return vextq_f64(a, a, offset); // Passes `a` as both vector operands and `offset` as the index argument.
+ }
+};
+
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b5a0ba2bc..3653783fd 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }
+template<size_t offset>
+struct protate_impl<offset, Packet4f> // SSE specialization of protate_impl for Packet4f.
+{
+ static Packet4f run(const Packet4f& a) {
+ return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // Swizzle indices start at `offset` and step by one, wrapping modulo 4.
+ }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i> // SSE specialization of protate_impl for Packet4i.
+{
+ static Packet4i run(const Packet4i& a) {
+ return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // Swizzle indices start at `offset` and step by one, wrapping modulo 4.
+ }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet2d> // SSE specialization of protate_impl for Packet2d.
+{
+ static Packet2d run(const Packet2d& a) {
+ return vec2d_swizzle1(a, offset, (offset + 1) % 2); // Two swizzle indices: `offset`, then the next one wrapped modulo 2.
+ }
+};
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c8a1dcced..6a16aa661 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -771,7 +771,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const Index peeled_kc = depth & ~(pk-1);
const Index prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
-
+
+#if EIGEN_ARCH_ARM
+ const bool PreferRotatingKernel = true; // The rotating kernel is preferred only when EIGEN_ARCH_ARM evaluates to non-zero.
+#else
+ const bool PreferRotatingKernel = false;
+#endif
+
+ const bool UseRotatingKernel = // Used only when preferred and the Lhs, Rhs and Res packet sizes are all exactly 4.
+ PreferRotatingKernel &&
+ Traits::LhsPacketSize == 4 &&
+ Traits::RhsPacketSize == 4 &&
+ Traits::ResPacketSize == 4;
+
//---------- Process 3 * LhsProgress rows at once ----------
// This corresponds to 3*LhsProgress x nr register blocks.
// Usually, make sense only with FMA
@@ -818,7 +830,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
RhsPacket B_0, T0;
LhsPacket A2;
-#define EIGEN_GEBGP_ONESTEP(K) \
+#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) /* Sets B_0 for sub-step N of step K; reads blB and, on the rotating path, the previous B_0. */ \
+ do { \
+ if (UseRotatingKernel) { \
+ if (N == 0) { /* rotating path, first sub-step: load a packet from blB with pload */ \
+ B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
+ } else { /* rotating path, later sub-steps: no memory access, rotate the current B_0 by one coefficient */ \
+ EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
+ B_0 = protate<1>(B_0); \
+ } \
+ } else { /* non-rotating path: every sub-step goes through traits.loadRhs at index (N+4*K)*RhsProgress */ \
+ traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
+ } \
+ } while (false)
+
+#define EIGEN_GEBP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
@@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, T0); \
traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
traits.madd(A0, B_0, C1, T0); \
traits.madd(A1, B_0, C5, T0); \
traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
traits.madd(A0, B_0, C2, T0); \
traits.madd(A1, B_0, C6, T0); \
traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+ EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
traits.madd(A0, B_0, C3 , T0); \
traits.madd(A1, B_0, C7, T0); \
traits.madd(A2, B_0, C11, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
} while(false)
-
+
internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
- EIGEN_GEBGP_ONESTEP(0);
- EIGEN_GEBGP_ONESTEP(1);
- EIGEN_GEBGP_ONESTEP(2);
- EIGEN_GEBGP_ONESTEP(3);
- EIGEN_GEBGP_ONESTEP(4);
- EIGEN_GEBGP_ONESTEP(5);
- EIGEN_GEBGP_ONESTEP(6);
- EIGEN_GEBGP_ONESTEP(7);
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
blB += pk*4*RhsProgress;
blA += pk*3*Traits::LhsProgress;
@@ -866,12 +892,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
RhsPacket B_0, T0;
LhsPacket A2;
- EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 3*Traits::LhsProgress;
}
- #undef EIGEN_GEBGP_ONESTEP
-
+#undef EIGEN_GEBP_ONESTEP
+#undef EIGEN_GEBP_ONESTEP_LOADRHS // helper of EIGEN_GEBP_ONESTEP; drop it as well so it does not leak out of this kernel
+
+ if (UseRotatingKernel) {
+ // On the rotating path B_0 is rotated by one coefficient between sub-steps instead of being
+ // reloaded, so the accumulators must be un-rotated before they are scaled and stored.
+ // For each group of four accumulators (updated in place): transpose, rotate rows 3, 2 and 1
+ // by 1, 2 and 3 coefficients respectively, then transpose back.
+ #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
+ do { \
+ PacketBlock<ResPacket> resblock; \
+ resblock.packet[0] = res0; \
+ resblock.packet[1] = res1; \
+ resblock.packet[2] = res2; \
+ resblock.packet[3] = res3; \
+ ptranspose(resblock); \
+ resblock.packet[3] = protate<1>(resblock.packet[3]); \
+ resblock.packet[2] = protate<2>(resblock.packet[2]); \
+ resblock.packet[1] = protate<3>(resblock.packet[1]); \
+ ptranspose(resblock); \
+ res0 = resblock.packet[0]; \
+ res1 = resblock.packet[1]; \
+ res2 = resblock.packet[2]; \
+ res3 = resblock.packet[3]; \
+ } while (false)
+
+ EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
+ EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
+ EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
+ // Keep the helper macro local to this spot, as is done for EIGEN_GEBP_ONESTEP.
+ #undef EIGEN_GEBP_UNROTATE_RESULT
+ }
+
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 7538a0633..5e16b775b 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -93,7 +93,8 @@
THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
- STORAGE_LAYOUT_DOES_NOT_MATCH
+ STORAGE_LAYOUT_DOES_NOT_MATCH,
+ ROTATION_BY_ILLEGAL_OFFSET
};
};