diff options
author | 2014-03-20 16:03:46 +0100 | |
---|---|---|
committer | 2014-03-20 16:03:46 +0100 | |
commit | 01fd880424f0e937af7841202af67e6e4ee6fc07 (patch) | |
tree | 2c5585a87e8c1a28a4ab43cce7b47ad240034ed3 /Eigen | |
parent | c39a3fa7a1808233ad6556e169e0c08d3bc979e1 (diff) |
Revert previous change and introduce a new workaround regarding gcc generating a shufps instruction instead of the more efficient pshufd instruction.
The trick consists in introducing a new pload1 function to be used in low level product kernels for which bug #203 does not apply.
Indeed, it turned out that using inline assembly prevents gcc from doing a good job at instruction reordering.
Diffstat (limited to 'Eigen')
-rwxr-xr-x[-rw-r--r--] | Eigen/src/Core/GenericPacketMath.h | 4 | ||||
-rwxr-xr-x[-rw-r--r--] | Eigen/src/Core/arch/SSE/PacketMath.h | 26 |
2 files changed, 16 insertions, 14 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index b0469fa1e..538ab53b2 100644..100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -169,6 +169,10 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits<Packet>::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); } + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename packet_traits<Scalar>::type plset(const Scalar& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ea14111e3..293fb83e4 100644..100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,24 +110,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } #else - -// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction. -// However, with AVX, we want it to generate a vbroadcastss. 
-// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203) -#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) - template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { - Packet4f res; - asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from)); - return res; - } -#else - template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } -#endif - +template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } #endif +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. +// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) +// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. +// Also note that with AVX, we want it to generate a vbroadcastss. +#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } |