diff options
Diffstat (limited to 'Eigen/src')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 8 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 269 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 5 | ||||
-rw-r--r-- | Eigen/src/Core/util/StaticAssert.h | 3 |
5 files changed, 205 insertions, 84 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 678938c6b..722881215 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -313,7 +313,8 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template<size_t offset, typename Packet> struct protate_impl { - static Packet run(const Packet& a) { return a; } + // Empty so attempts to use this unimplemented path will fail to compile. + // Only specializations of this template should be used. }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, @@ -322,7 +323,6 @@ struct protate_impl */ template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) { - EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET); return offset ? protate_impl<offset, Packet>::run(a) : a; } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e9af45f22..8dd1e1370 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -76,12 +76,12 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; - typedef Packet2f half; + typedef Packet4f half; // Packet2f intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket=1, + HasHalfPacket=0, // Packet2f intrinsics not implemented yet HasDiv = 1, // FIXME check the Has* @@ -95,12 +95,12 @@ template<> struct packet_traits<float> : default_packet_traits template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; - typedef Packet2i half; + typedef Packet4i half; // Packet2i intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=1 + HasHalfPacket=0 // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0890bc690..9061fd936 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES EIGEN_UNUSED_VARIABLE(num_threads); enum { - kr = 16, + kr = 8, mr = Traits::mr, nr = Traits::nr }; k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); if (k > kr) k -= k % kr; - m = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); if (m > mr) m -= m % mr; - n = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); if (n > nr) n -= n % nr; return; #endif @@ -153,16 +153,104 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads } else { // In unit tests we do not want to use extra large matrices, - // so we reduce the block size to check the blocking strategy is not flawed -#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min<Index>(n,3840/sizeof(RhsScalar)); - m = std::min<Index>(m,3840/sizeof(RhsScalar)); -#else - k = std::min<Index>(k,24); - n = std::min<Index>(n,384/sizeof(RhsScalar)); - m = std::min<Index>(m,384/sizeof(RhsScalar)); + // so we reduce the cache size to check the blocking strategy is not flawed +#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + l1 = 4*1024; + l2 = 32*1024; + l3 = 512*1024; #endif + + // Early return for small problems because the computation below are time consuming for small problems. + // Perhaps it would make more sense to consider k*n*m?? + // Note that for very tiny problem, this function should be bypassed anyway + // because we use the coefficient-based implementation for them. + if(std::max(k,std::max(m,n))<48) + return; + + typedef typename Traits::ResScalar ResScalar; + enum { + k_peeling = 8, + k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) + }; + + // ---- 1st level of blocking on L1, yields kc ---- + + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel + // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. + // We also include a register-level block of the result (mx x nr). + // (In an ideal world only the lhs panel would stay in L1) + // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: + const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1)); + const Index old_k = k; + if(k>max_kc) + { + // We are really blocking on the third dimension: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the result. + k = (k%max_kc)==0 ? max_kc + : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); + + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); + } + + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- + + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: + // actual_l2 = max(l2, l3/nb_core_sharing_l3) + // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) + // For instance, it corresponds to 6MB of L3 shared among 4 cores. + #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + const Index actual_l2 = l3; + #else + const Index actual_l2 = 1572864; // == 1.5 MB + #endif + + + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. + // The second half is implicitly reserved to access the result and lhs coefficients. + // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful + // to limit this growth: we bound nc to growth by a factor x1.5, leading to: + const Index max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + // WARNING Below, we assume that Traits::nr is a power of two. + Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + if(n>nc) + { + // We are really blocking over the columns: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the packed lhs. + // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1" + n = (n%nc)==0 ? nc + : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1)))); + } + else if(old_k==k) + { + // So far, no blocking at all, i.e., kc==k, and nc==n. + // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + Index problem_size = k*n*sizeof(LhsScalar); + Index actual_lm = actual_l2; + Index max_mc = m; + if(problem_size<=1024) + { + // problem is small enough to keep in L1 + // Let's choose m such that lhs's block fit in 1/3 of L1 + actual_lm = l1; + } + else if(l3!=0 && problem_size<=32768) + { + // we have both L2 and L3, and problem is small enough to be kept in L2 + // Let's choose m such that lhs's block fit in 1/3 of L2 + actual_lm = l2; + max_mc = 576; + } + + Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + if (mc > Traits::mr) mc -= mc % Traits::mr; + + m = (m%mc)==0 ? mc + : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); + } } } @@ -712,6 +800,80 @@ protected: conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; }; +// helper for the rotating kernel below +template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel> +struct PossiblyRotatingKernelHelper +{ + // default implementation, not rotating + + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + + template <size_t K, size_t Index> + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to); + } + + void unrotateResult(AccPacket&, + AccPacket&, + AccPacket&, + AccPacket&) + { + } +}; + +// rotating implementation +template <typename GebpKernel> +struct PossiblyRotatingKernelHelper<GebpKernel, true> +{ + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + template <size_t K, size_t Index> + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + if (Index == 0) { + to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress); + } else { + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); + to = protate<1>(to); + } + } + + void unrotateResult(AccPacket& res0, + AccPacket& res1, + AccPacket& res2, + AccPacket& res3) + { + PacketBlock<AccPacket> resblock; + resblock.packet[0] = res0; + resblock.packet[1] = res1; + resblock.packet[2] = res2; + resblock.packet[3] = res3; + ptranspose(resblock); + resblock.packet[3] = protate<1>(resblock.packet[3]); + resblock.packet[2] = protate<2>(resblock.packet[2]); + resblock.packet[1] = protate<3>(resblock.packet[1]); + ptranspose(resblock); + res0 = resblock.packet[0]; + res1 = resblock.packet[1]; + res2 = resblock.packet[2]; + res3 = resblock.packet[3]; + } +}; + /* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B @@ -745,6 +907,16 @@ struct gebp_kernel ResPacketSize = Traits::ResPacketSize }; + + static const bool UseRotatingKernel = + EIGEN_ARCH_ARM && + internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value && + internal::is_same<ResScalar, float>::value && + Traits::LhsPacketSize == 4 && + Traits::RhsPacketSize == 4 && + Traits::ResPacketSize == 4; + EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, @@ -778,6 +950,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // Usually, make sense only with FMA if(mr>=3*Traits::LhsProgress) { + PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits); + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress) { @@ -813,43 +987,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga prefetch(&blB[0]); LhsPacket A0, A1; -#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL - static const bool UseRotatingKernel = - Traits::LhsPacketSize == 4 && - Traits::RhsPacketSize == 4 && - Traits::ResPacketSize == 4; -#endif - for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); RhsPacket B_0, T0; LhsPacket A2; -#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \ - traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - do { \ - if (UseRotatingKernel) { \ - if (N == 0) { \ - B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \ - } else { \ - EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ - B_0 = protate<1>(B_0); \ - } \ - } else { \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \ - } \ - } while (false) -#else -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) -#endif - #define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ @@ -859,19 +1002,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ @@ -904,34 +1047,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga } #undef EIGEN_GEBP_ONESTEP -#undef EIGEN_GEBP_ONESTEP_LOADRHS -#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING - -#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL - if (UseRotatingKernel) { - #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \ - do { \ - PacketBlock<ResPacket> resblock; \ - resblock.packet[0] = res0; \ - resblock.packet[1] = res1; \ - resblock.packet[2] = res2; \ - resblock.packet[3] = res3; \ - ptranspose(resblock); \ - resblock.packet[3] = protate<1>(resblock.packet[3]); \ - resblock.packet[2] = protate<2>(resblock.packet[2]); \ - resblock.packet[1] = protate<3>(resblock.packet[1]); \ - ptranspose(resblock); \ - res0 = resblock.packet[0]; \ - res1 = resblock.packet[1]; \ - res2 = resblock.packet[2]; \ - res3 = resblock.packet[3]; \ - } while (false) - - EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); - EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); - EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); - } -#endif + + possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3); + possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7); + possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11); ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c38c12c31..c76f48154 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; // For each horizontal panel of the rhs, and corresponding panel of the lhs... for(Index i2=0; i2<rows; i2+=mc) @@ -188,7 +190,8 @@ static void run(Index rows, Index cols, Index depth, // We pack the rhs's block into a sequential chunk of memory (L2 caching) // Note that this block will be read a very high number of times, which is equal to the number of // micro horizontal panel of the large rhs's panel (e.g., rows/12 times). - pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + if((!pack_rhs_once) || i2==0) + pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); // Everything is packed, we can now call the panel * block kernel: gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 5e16b775b..7538a0633 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,8 +93,7 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - ROTATION_BY_ILLEGAL_OFFSET + STORAGE_LAYOUT_DOES_NOT_MATCH }; }; |