diff options
author | 2015-02-27 15:30:10 -0500 | |
---|---|---|
committer | 2015-02-27 15:30:10 -0500 | |
commit | 6466fa63be51b32c493401bb7cf6aa4b2e1ef385 (patch) | |
tree | 6aebc98f506ccaf5a6220df703a85817baba5957 | |
parent | bf9877a92a4f049d5cbfda1817bac21167b5a324 (diff) |
Reimplement the selection between rotating and non-rotating kernels
using templates instead of macros and if()'s.
That was needed to fix the build of unit tests on ARM, which I had
broken. My bad for not testing earlier.
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 7 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 157 |
2 files changed, 96 insertions, 68 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index afdd6f71e..e48c064ce 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -290,11 +290,8 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template<size_t offset, typename Packet> struct protate_impl { - static Packet run(const Packet& a) - { - eigen_assert(false && "unimplemented"); - return a; - } + // Empty so attempts to use this unimplemented path will fail to compile. + // Only specializations of this template should be used. }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 06c7f362e..9061fd936 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -800,6 +800,80 @@ protected: conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; }; +// helper for the rotating kernel below +template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel> +struct PossiblyRotatingKernelHelper +{ + // default implementation, not rotating + + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + + template <size_t K, size_t Index> + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to); + } + + void unrotateResult(AccPacket&, + AccPacket&, + AccPacket&, + AccPacket&) + { + } +}; + +// rotating implementation +template <typename GebpKernel> +struct PossiblyRotatingKernelHelper<GebpKernel, true> +{ + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + template <size_t K, size_t Index> + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + if (Index == 0) { + to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress); + } else { + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); + to = protate<1>(to); + } + } + + void unrotateResult(AccPacket& res0, + AccPacket& res1, + AccPacket& res2, + AccPacket& res3) + { + PacketBlock<AccPacket> resblock; + resblock.packet[0] = res0; + resblock.packet[1] = res1; + resblock.packet[2] = res2; + resblock.packet[3] = res3; + ptranspose(resblock); + resblock.packet[3] = protate<1>(resblock.packet[3]); + resblock.packet[2] = protate<2>(resblock.packet[2]); + resblock.packet[1] = protate<3>(resblock.packet[1]); + ptranspose(resblock); + res0 = resblock.packet[0]; + res1 = resblock.packet[1]; + res2 = resblock.packet[2]; + res3 = resblock.packet[3]; + } +}; + /* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B @@ -833,6 +907,16 @@ struct gebp_kernel ResPacketSize = Traits::ResPacketSize }; + + static const bool UseRotatingKernel = + EIGEN_ARCH_ARM && + internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value && + internal::is_same<ResScalar, float>::value && + Traits::LhsPacketSize == 4 && + Traits::RhsPacketSize == 4 && + Traits::ResPacketSize == 4; + EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, @@ -866,6 +950,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // Usually, make sense only with FMA if(mr>=3*Traits::LhsProgress) { + PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits); + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress) { @@ -901,43 +987,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga prefetch(&blB[0]); LhsPacket A0, A1; -#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL - static const bool UseRotatingKernel = - Traits::LhsPacketSize == 4 && - Traits::RhsPacketSize == 4 && - Traits::ResPacketSize == 4; -#endif - for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); RhsPacket B_0, T0; LhsPacket A2; -#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \ - traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - do { \ - if (UseRotatingKernel) { \ - if (N == 0) { \ - B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \ - } else { \ - EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ - B_0 = protate<1>(B_0); \ - } \ - } else { \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \ - } \ - } while (false) -#else -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) -#endif - #define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ @@ -947,19 +1002,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ @@ -992,34 +1047,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga } #undef EIGEN_GEBP_ONESTEP -#undef EIGEN_GEBP_ONESTEP_LOADRHS -#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL - if (UseRotatingKernel) { - #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \ - do { \ - PacketBlock<ResPacket> resblock; \ - resblock.packet[0] = res0; \ - resblock.packet[1] = res1; \ - resblock.packet[2] = res2; \ - resblock.packet[3] = res3; \ - ptranspose(resblock); \ - resblock.packet[3] = protate<1>(resblock.packet[3]); \ - resblock.packet[2] = protate<2>(resblock.packet[2]); \ - resblock.packet[1] = protate<3>(resblock.packet[1]); \ - ptranspose(resblock); \ - res0 = resblock.packet[0]; \ - res1 = resblock.packet[1]; \ - res2 = resblock.packet[2]; \ - res3 = resblock.packet[3]; \ - } while (false) - - EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); - EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); - EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); - } -#endif + + possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3); + possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7); + possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11); ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); |