diff options
Diffstat (limited to 'Eigen/src/Core/arch/SSE/PacketMath.h')
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 16 |
1 file changed, 11 insertions, 5 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 6f31cf12b..3646abdb1 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -504,6 +504,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
@@ -515,7 +516,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
 #else
 // SSE2 versions
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
@@ -523,10 +523,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
   return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
 
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
@@ -548,6 +544,16 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 }
 #endif // SSE3
 
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}
 
 #ifdef EIGEN_VECTORIZE_SSSE3
 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)