aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2014-03-20 16:03:46 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2014-03-20 16:03:46 +0100
commit01fd880424f0e937af7841202af67e6e4ee6fc07 (patch)
tree2c5585a87e8c1a28a4ab43cce7b47ad240034ed3 /Eigen
parentc39a3fa7a1808233ad6556e169e0c08d3bc979e1 (diff)
Revert previous change and introduce a new workaround regarding gcc generating a shufps instruction instead of the more efficient pshufd instruction.
The trick consists in introducing a new pload1 function to be used in low level product kernels for which bug #203 does not apply. Indeed, it turned out that using inline assembly prevents gcc of doing a good job at instructtion reordering.
Diffstat (limited to 'Eigen')
-rwxr-xr-x[-rw-r--r--]Eigen/src/Core/GenericPacketMath.h4
-rwxr-xr-x[-rw-r--r--]Eigen/src/Core/arch/SSE/PacketMath.h26
2 files changed, 16 insertions, 14 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index b0469fa1e..538ab53b2 100644..100755
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -169,6 +169,10 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
+
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Scalar> inline typename packet_traits<Scalar>::type
plset(const Scalar& a) { return a; }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ea14111e3..293fb83e4 100644..100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -110,24 +110,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
#else
-
-// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction.
-// However, with AVX, we want it to generate a vbroadcastss.
-// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203)
-#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
- Packet4f res;
- asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from));
- return res;
- }
-#else
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
-#endif
-
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+ return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }