diff options
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 15 | +15 -0 |
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 25 | +25 -0 |
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 23 | +23 -0 |
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 86 | +68 -18 |
-rw-r--r-- | Eigen/src/Core/util/StaticAssert.h | 3 | +2 -1 |
5 files changed, 133 insertions, 19 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 74e1174ae..967a07df5 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -287,6 +287,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } +template<size_t offset, typename Packet> +struct protate_impl +{ + static Packet run(const Packet& a) { return a; } +}; + +/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, + * by the given offset, e.g. for offset == 1: + * (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1]) + */ +template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) +{ + EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET); + return offset ? protate_impl<offset, Packet>::run(a) : a; +} /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8149aed7f..e9af45f22 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { a_hi = vget_high_s32(a_r64); return vcombine_s32(a_hi, a_lo); } + +template<size_t offset> +struct protate_impl<offset, Packet4f> +{ + static Packet4f run(const Packet4f& a) { + return vextq_f32(a, a, offset); + } +}; + +template<size_t offset> +struct protate_impl<offset, Packet4i> +{ + static Packet4i run(const Packet4i& a) { + return vextq_s32(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> 
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } @@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template<size_t offset> +struct protate_impl<offset, Packet2d> +{ + static Packet2d run(const Packet2d& a) { + return vextq_f64(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b5a0ba2bc..3653783fd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template<size_t offset> +struct protate_impl<offset, Packet4f> +{ + static Packet4f run(const Packet4f& a) { + return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); + } +}; + +template<size_t offset> +struct protate_impl<offset, Packet4i> +{ + static Packet4i run(const Packet4i& a) { + return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); + } +}; + +template<size_t offset> +struct protate_impl<offset, Packet2d> +{ + static Packet2d run(const Packet2d& a) { + return vec2d_swizzle1(a, offset, (offset + 1) % 2); + } +}; template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8a1dcced..6a16aa661 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -771,7 +771,19 @@ void 
gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga const Index peeled_kc = depth & ~(pk-1); const Index prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; - + +#if EIGEN_ARCH_ARM + const bool PreferRotatingKernel = true; +#else + const bool PreferRotatingKernel = false; +#endif + + const bool UseRotatingKernel = + PreferRotatingKernel && + Traits::LhsPacketSize == 4 && + Traits::RhsPacketSize == 4 && + Traits::ResPacketSize == 4; + //---------- Process 3 * LhsProgress rows at once ---------- // This corresponds to 3*LhsProgress x nr register blocks. // Usually, make sense only with FMA @@ -818,7 +830,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga RhsPacket B_0, T0; LhsPacket A2; -#define EIGEN_GEBGP_ONESTEP(K) \ +#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ + do { \ + if (UseRotatingKernel) { \ + if (N == 0) { \ + B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \ + } else { \ + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ + B_0 = protate<1>(B_0); \ + } \ + } else { \ + traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \ + } \ + } while (false) + +#define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ @@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \ + EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \ + EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ - 
traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \ + EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \ + EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ } while(false) - + internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */ - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); + EIGEN_GEBP_ONESTEP(0); + EIGEN_GEBP_ONESTEP(1); + EIGEN_GEBP_ONESTEP(2); + EIGEN_GEBP_ONESTEP(3); + EIGEN_GEBP_ONESTEP(4); + EIGEN_GEBP_ONESTEP(5); + EIGEN_GEBP_ONESTEP(6); + EIGEN_GEBP_ONESTEP(7); blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; @@ -866,12 +892,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { RhsPacket B_0, T0; LhsPacket A2; - EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; blA += 3*Traits::LhsProgress; } - #undef EIGEN_GEBGP_ONESTEP - +#undef EIGEN_GEBP_ONESTEP + + if (UseRotatingKernel) { + #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \ + do { \ + PacketBlock<ResPacket> resblock; \ + resblock.packet[0] = res0; \ + resblock.packet[1] = res1; \ + resblock.packet[2] = res2; \ + resblock.packet[3] = res3; \ + ptranspose(resblock); \ + resblock.packet[3] = protate<1>(resblock.packet[3]); \ + resblock.packet[2] = protate<2>(resblock.packet[2]); \ + resblock.packet[1] = protate<3>(resblock.packet[1]); \ + ptranspose(resblock); \ + res0 = resblock.packet[0]; \ + res1 = resblock.packet[1]; \ + res2 = resblock.packet[2]; \ + res3 = resblock.packet[3]; \ + } while (false) + + EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); + EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, 
C7); + EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); + } + ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 7538a0633..5e16b775b 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,7 +93,8 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH + STORAGE_LAYOUT_DOES_NOT_MATCH, + ROTATION_BY_ILLEGAL_OFFSET }; }; |