diff options
author | Gael Guennebaud <g.gael@free.fr> | 2014-01-25 16:54:13 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2014-01-25 16:54:13 +0100 |
commit | a7621809fe0fc7c65446ab9a83739ebe313004d4 (patch) | |
tree | 2503577f20b50b5351bc4ad0a9177b131886707a | |
parent | 6cf938df53b5f26690a0e8034e10d11640d3a5c2 (diff) |
Remove useless register keyword, and optimize predux_min/max for SSE4
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 18 |
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 6 |
2 files changed, 17 insertions, 7 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f85d2e06e..f5a3dab52 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -504,13 +504,18 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) } template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) { +#ifdef EIGEN_VECTORIZE_SSE4_1 + Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); + return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); +#else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; - register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; + int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; + int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; return aux0<aux2 ? aux0 : aux2; +#endif // EIGEN_VECTORIZE_SSE4_1 } // max @@ -525,13 +530,18 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) } template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) { +#ifdef EIGEN_VECTORIZE_SSE4_1 + Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); + return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); +#else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; - register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; + int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; + int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; return aux0>aux2 ? 
aux0 : aux2; +#endif // EIGEN_VECTORIZE_SSE4_1 } #if (defined __GNUC__) diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index c40e80f53..f698f67f9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -79,8 +79,8 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd for (Index j=FirstTriangular ? bound : 0; j<(FirstTriangular ? size : bound);j+=2) { - register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; - register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; + const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; Scalar t0 = cjAlpha * rhs[j]; Packet ptmp0 = pset1<Packet>(t0); @@ -147,7 +147,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd } for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++) { - register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; Scalar t1 = cjAlpha * rhs[j]; Scalar t2(0); |