diff options
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 510 |
1 files changed, 338 insertions, 172 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index afbd83eda..030c7740a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -353,6 +353,24 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template <typename RhsPacket, typename RhsPacketx4, int registers_taken> +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type; +}; + +template <typename Packet> +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + template <int N, typename T1, typename T2, typename T3> struct packet_conditional { typedef T3 type; }; @@ -448,29 +466,35 @@ public: typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template<typename RhsPacketType> EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); @@ -488,8 +512,8 @@ public: dest = ploadu<LhsPacketType>(a); } - template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -504,6 +528,12 @@ public: #endif } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -555,6 +585,8 @@ public: typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -567,6 +599,20 @@ public: { dest = pset1<RhsPacketType>(*b); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { @@ -598,18 +644,8 @@ public: dest = ploadu<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - - template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } @@ -630,10 +666,16 @@ public: c += a * b; } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + template <typename ResPacketType, typename AccPacketType> EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; + conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; r = cj.pmadd(c,alpha,r); } @@ -756,6 +798,9 @@ public: typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; + + // this actualy holds 8 packets! + typedef QuadPacket<RhsPacket> RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -778,39 +823,37 @@ public: dest.first = pset1<RealPacketType>(real(*b)); dest.second = pset1<RealPacketType>(imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - loadQuadToDoublePacket(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -825,17 +868,26 @@ public: dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a)); } - template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/) const + template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType> + EIGEN_STRONG_INLINE + typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template<typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } @@ -914,7 +966,7 @@ public: typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; - + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -927,18 +979,20 @@ public: { dest = pset1<RhsPacketType>(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1<RhsPacket>(b+0); -// b1 = pload1<RhsPacket>(b+1); -// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -956,8 +1010,8 @@ public: dest = ploaddup<LhsPacketType>(a); } - template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } @@ -979,48 +1033,135 @@ public: c += a * b; } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + template <typename ResPacketType, typename AccPacketType> EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; + conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; r = cj.pmadd(alpha,c,r); } protected: - + }; #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON -template<> -struct gebp_traits <float, float, false, false,Architecture::NEON> - : gebp_traits<float,float,false,false,Architecture::Generic> +template<int _PacketSize> +struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize> + : gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize> { typedef float RhsPacket; - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + dest = *b; } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - dest = *b; + dest = vld1q_f32(b); } + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_n_f32(c, a, b); } + + template<int LaneID> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits <double, double, false, false,Architecture::NEON> + : gebp_traits<double,double,false,false,Architecture::Generic> +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + template<int LaneID> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } }; #endif @@ -1044,6 +1185,9 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; @@ -1148,7 +1292,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, SRhsPacketQuarter b0; straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); blB += SwappedTraits::LhsProgress/4; blA += 1; } @@ -1165,21 +1309,25 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> struct lhs_process_one_packet { + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; - EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) { EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0); - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); - traits.madd(*A0, *B_0, *C0, *B_0); - traits.madd(*A0, *B1, *C1, *B1); - traits.madd(*A0, *B2, *C2, *B2); - traits.madd(*A0, *B3, *C3, *B3); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); } - EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) { GEBPTraits traits; @@ -1221,18 +1369,19 @@ struct lhs_process_one_packet for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); - RhsPacket B_0, B1, B2, B3; + RhsPacketx4 rhs_panel; + RhsPacket T0; internal::prefetch(blB+(48+0)); - peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(1, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(2, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(3, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); internal::prefetch(blB+(48+16)); - peeled_kc_onestep(4, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(5, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(6, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(7, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); blB += pk*4*RhsProgress; blA += pk*LhsProgress; @@ -1243,8 +1392,9 @@ struct lhs_process_one_packet // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3; - peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); blB += 4*RhsProgress; blA += LhsProgress; } @@ -1293,9 +1443,10 @@ struct lhs_process_one_packet do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ } while(false); @@ -1372,7 +1523,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -1430,36 +1581,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); - RhsPacket B_0, T0; + // 15 registers are taken (12 for acc, 2 for lhs). + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; - -#define EIGEN_GEBP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0)) + // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633 + // without this workaround A0, A1, and A2 are loaded in the same register, + // which is not good for pipelining + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2)); + #else + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND + #endif +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - internal::prefetch(blA+(3*K+16)*LhsProgress); \ - if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, T0); \ - traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C1, T0); \ - traits.madd(A1, B_0, C5, T0); \ - traits.madd(A2, B_0, C9, B_0); \ - traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C2, T0); \ - traits.madd(A1, B_0, C6, T0); \ - traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C3 , T0); \ - traits.madd(A1, B_0, C7, T0); \ - traits.madd(A2, B_0, C11, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ - } while(false) + internal::prefetch(blA + (3 * K + 16) * LhsProgress); \ + if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \ + internal::prefetch(blB + (4 * K + 16) * RhsProgress); \ + } /* Bug 953 */ \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \ + traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1479,7 +1642,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, T0; + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; @@ -1559,20 +1723,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A2, B_0, C8, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ - } while(false) - + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1661,7 +1825,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; // NOTE: the begin/end asm comments below work around bug 935! // but they are not enough for gcc>=6 without FMA (bug 1637) @@ -1670,24 +1835,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif - #define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3); \ - EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while(false) - +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1707,7 +1872,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 2*Traits::LhsProgress; @@ -1778,8 +1944,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ - traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B_0, C0, B1, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1882,15 +2048,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); - straits.madd(A0,B_0,C0,B_0); - straits.madd(A1,B_1,C1,B_1); + straits.madd(A0,B_0,C0,B_0, fix<0>); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1903,7 +2069,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); - straits.madd(A0,B_0,C0,B_0); + straits.madd(A0,B_0,C0,B_0, fix<0>); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1927,7 +2093,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); SAccPacketHalf c0 = predux_half_dowto4(C0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); straits.acc(c0, alphav, R); } else @@ -2273,7 +2439,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa } pack -= psize; - int left = rows - i; + Index left = rows - i; if (pack <= 0) { if (!gone_last && (starting_pos == i || left >= psize/2 || left >= psize/4) && |