about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2016-11-22 21:53:14 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2016-11-22 21:53:14 +0100
commit: 178c084856003f1cfd3020615ab98230d9520a80 (patch)
tree: 13d998c01cce9e7f4ab0920c63da41adfcd91fed
parent: 7dd894e40e439a6d1f4aed659d1375d65589cff3 (diff)
Disable usage of SSE3 _mm_hadd_ps that is extremely slow.
-rwxr-xr-x Eigen/src/Core/arch/SSE/PacketMath.h 26
1 file changed, 13 insertions, 13 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3646abdb1..80cf8af09 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -510,20 +510,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
return _mm_hadd_pd(vecs[0], vecs[1]);
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp0 = _mm_hadd_ps(a,a);
- return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
-}
-
#else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
Packet4f tmp0, tmp1, tmp2;
@@ -544,6 +531,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
}
#endif // SSE3
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+ // Adds a's four floats into the low lane (a0+a2, then +(a1+a3)); pfirst presumably extracts that lane.
+ // SSE3 _mm_hadd_ps is disabled: it is extremely slow on all existing Intel architectures (Nehalem to Haswell).
+// #ifdef EIGEN_VECTORIZE_SSE3
+// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
+ Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+ return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
+}
+
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures