diff options
author | oem <oem@efikamx> | 2010-04-24 00:44:14 +0300 |
---|---|---|
committer | oem <oem@efikamx> | 2010-04-24 00:44:14 +0300 |
commit | 6972c140f737874d88da0e225c7c27b4563a4518 (patch) | |
tree | 3863bc8ca18553ea0d7ac34e8fab3aa5eda682f4 /Eigen | |
parent | e3e34b5920c9d522e53e4800218b53633df71f86 (diff) |
Replaced _mm_prefetch in GeneralBlockPanelKernel.h with the ei_prefetch() inline function.
Implemented NEON and AltiVec versions; the SSE version was copied over from GeneralBlockPanelKernel.h.
The generic fallback in GenericPacketMath.h uses __builtin_prefetch() in the GCC case (or rather, whenever _MSC_VER is not defined); when _MSC_VER is defined it is a no-op.
The NEON version gave a small but welcome boost: 0.88 GFLOPS -> 0.91 GFLOPS.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 8 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 9 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 7 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 34 |
5 files changed, 36 insertions, 26 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 08981f89d..46fa5c186 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -169,6 +169,14 @@ template<typename Scalar, typename Packet> inline void ei_pstore(Scalar* to, con template<typename Scalar, typename Packet> inline void ei_pstoreu(Scalar* to, const Packet& from) { (*to) = from; } +/** \internal tries to do cache prefetching of \a addr */ +template<typename Scalar> inline void ei_prefetch(const Scalar* addr) +{ +#if !defined(_MSC_VER) +__builtin_prefetch(addr); +#endif +} + /** \internal \returns the first element of a packet */ template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pfirst(const Packet& a) { return a; } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index c6fc670d8..0a7b07645 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -67,6 +67,8 @@ typedef __vector unsigned char Packet16uc; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i ei_p4i_##NAME = ei_pset1<int>(X) +#define DST_CHAN 1 +#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) // Define global static constants: static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; @@ -291,8 +293,8 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f edgeAlign = vec_lvsl(0, to); // permute map to extract edges edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) + MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) + LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ 
part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } @@ -315,6 +317,9 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } +template<> EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } + template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2acb3633a..96c75101c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -53,6 +53,10 @@ typedef int32x4_t Packet4i; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) +#ifndef __pld +#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); +#endif + template<> struct ei_packet_traits<float> : ei_default_packet_traits { typedef Packet4f type; enum {size=4}; @@ -168,6 +172,9 @@ template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr) { __pld(addr); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr) { __pld(addr); } + // FIXME only store the 2 first elements ? 
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 77f15d982..d360081cd 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -233,6 +233,10 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2 template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castps_pd(from)); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castsi128_pd(from)); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) // The temporary variable fixes an internal compilation error. // Direct of the struct members fixed bug #62. 
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 5e219e077..bc697cef5 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -117,9 +117,7 @@ struct ei_gebp_kernel for(int i=0; i<peeled_mc; i+=mr) { const Scalar* blA = &blockA[i*strideA+offsetA*mr]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); // TODO move the res loads to the stores @@ -139,12 +137,10 @@ struct ei_gebp_kernel Scalar* r2 = r1 + resStride; Scalar* r3 = r2 + resStride; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(r0+16), _MM_HINT_T0); - _mm_prefetch((const char*)(r1+16), _MM_HINT_T0); - _mm_prefetch((const char*)(r2+16), _MM_HINT_T0); - _mm_prefetch((const char*)(r3+16), _MM_HINT_T0); - #endif + ei_prefetch(r0+16); + ei_prefetch(r1+16); + ei_prefetch(r2+16); + ei_prefetch(r3+16); // performs "inner" product // TODO let's check wether the folowing peeled loop could not be @@ -334,9 +330,7 @@ struct ei_gebp_kernel { int i = peeled_mc; const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); // gets res block as register PacketType C0, C1, C2, C3; @@ -464,9 +458,7 @@ struct ei_gebp_kernel for(int i=peeled_mc2; i<rows; i++) { const Scalar* blA = &blockA[i*strideA+offsetA]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); // gets a 1 x nr res block as registers Scalar C0(0), C1(0), C2(0), C3(0); @@ -524,9 +516,7 @@ struct ei_gebp_kernel for(int i=0; i<peeled_mc; i+=mr) { const Scalar* blA = &blockA[i*strideA+offsetA*mr]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); // TODO move the res loads to the stores @@ -557,9 +547,7 @@ struct 
ei_gebp_kernel { int i = peeled_mc; const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); @@ -576,9 +564,7 @@ struct ei_gebp_kernel for(int i=peeled_mc2; i<rows; i++) { const Scalar* blA = &blockA[i*strideA+offsetA]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif + ei_prefetch(&blA[0]); // gets a 1 x 1 res block as registers Scalar C0(0); |