aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2016-11-22 16:58:31 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2016-11-22 16:58:31 +0100
commitf3fb0a1940c93c2eea2342b20506d652050ff48b (patch)
treee545487054c4618209964c97ccbf3a8726bfcd45 /Eigen/src/Core/arch/SSE
parent6a84246a6a5dd980e78e5609b3098cdbce93807c (diff)
Disable usage of SSE3 haddpd that is extremely slow.
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h16
1 files changed, 11 insertions, 5 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 6f31cf12b..3646abdb1 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -504,6 +504,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
+
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_hadd_pd(vecs[0], vecs[1]);
@@ -515,7 +516,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
#else
// SSE2 versions
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
@@ -523,10 +523,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
@@ -548,6 +544,16 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
}
#endif // SSE3
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+ // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+ // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+ return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}
#ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)