diff options
author | Gael Guennebaud <g.gael@free.fr> | 2016-11-22 21:53:14 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2016-11-22 21:53:14 +0100 |
commit | 178c084856003f1cfd3020615ab98230d9520a80 (patch) | |
tree | 13d998c01cce9e7f4ab0920c63da41adfcd91fed | |
parent | 7dd894e40e439a6d1f4aed659d1375d65589cff3 (diff) |
Disable usage of SSE3 _mm_hadd_ps that is extremely slow.
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 26 |
1 files changed, 13 insertions, 13 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3646abdb1..80cf8af09 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -510,20 +510,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) return _mm_hadd_pd(vecs[0], vecs[1]); } -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0)); -} - #else -// SSE2 versions -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); - return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} - template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) { Packet4f tmp0, tmp1, tmp2; @@ -544,6 +531,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) } #endif // SSE3 +template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) +{ + // Disable SSE3 _mm_hadd_ps that is extremely slow on all existing Intel's architectures + // (from Nehalem to Haswell) +// #ifdef EIGEN_VECTORIZE_SSE3 +// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3)); +// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp)); +// #else + Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); + return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); +// #endif +} + template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures |