aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2009-08-07 11:09:34 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2009-08-07 11:09:34 +0200
commitd1dc088ef045dcee5747b5c722f5f4f6bb58e2d1 (patch)
tree6d6d012f9b9f9247bd743eabe5a65130aff3c7e3 /Eigen/src/Core/arch
parent543a7857562b2058718d39ce444f3c0495373fc8 (diff)
* implement a second level of micro blocking (faster for small sizes)
* workaround GCC bad implementation of _mm_set1_p*
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r--Eigen/src/Core/arch/SSE/PacketMath.h15
1 files changed, 15 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3f1fd8ef5..3fd33afbf 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -74,8 +74,23 @@ template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size
template<> struct ei_unpacket_traits<Packet2d> { typedef double type; enum {size=2}; }; // Packet2d: scalar type is double, 2 elements per packet
template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; // Packet4i: scalar type is int, 4 elements per packet
+#ifdef __GNUC__
+// GCC sometimes expands _mm_set1_p* into several moves, which is inefficient,
+// so broadcast the scalar with a single shuffle/unpack instruction instead.
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { // GCC-only path (inside #ifdef __GNUC__): returns a packet with every lane set to from
+ Packet4f res = _mm_set_ss(from); // put from in the lowest lane; _mm_set_ss zeroes the upper three lanes
+ asm("shufps $0, %[x], %[x]" : [x] "+x" (res) : ); // shufps with immediate 0 copies lane 0 into all four lanes; the +x constraint makes res a read-write SSE register operand
+ return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { // GCC-only path (inside #ifdef __GNUC__): returns a packet with both lanes set to from
+ Packet2d res = _mm_set_sd(from); // put from in the low lane; _mm_set_sd zeroes the high lane
+ asm("unpcklpd %[x], %[x]" : [x] "+x" (res) : ); // unpcklpd of res with itself duplicates the low double into the high lane; the +x constraint makes res a read-write SSE register operand
+ return res;
+}
+#else
template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } // fallback when __GNUC__ is not defined: broadcast from to all lanes via the intrinsic
template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); } // fallback when __GNUC__ is not defined: broadcast from to both lanes via the intrinsic
+#endif
template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); } // integer broadcast: same code for every compiler, outside the #ifdef above
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } // lane-wise sum of the two float packets via _mm_add_ps