diff options
Diffstat (limited to 'Eigen/src/Core/arch/SSE/PacketMath.h')
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 61 |
1 files changed, 29 insertions, 32 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 78fd99e64..b9821ad80 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -148,12 +148,11 @@ struct packet_traits<float> : default_packet_traits { HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasCeil = 1, - HasFloor = 1 + HasFloor = 1, + HasRint = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRint = 1, - HasRound = 1 + HasRound = 1, #endif }; }; @@ -175,12 +174,10 @@ struct packet_traits<double> : default_packet_traits { HasRsqrt = 1, HasBlend = 1, HasFloor = 1, - HasCeil = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , + HasCeil = 1, + HasRint = 1, + #ifdef EIGEN_VECTORIZE_SSE4_1 HasRound = 1, - HasRint = 1 #endif }; }; @@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { ret template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); } #else -template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) -{ - const Packet4f cst_1 = pset1<Packet4f>(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); - // If greater, subtract one. - Packet4f mask = _mm_cmpgt_ps(tmp, a); - mask = pand(mask, cst_1); - tmp = psub(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f offset = + pselect(pcmp_lt(a, pzero(a)), + pset1<Packet4f>(-static_cast<float>(1<<23)), + pset1<Packet4f>(+static_cast<float>(1<<23))); + return psub(padd(a, offset), offset); } -// Rounds to nearest integer. -EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { - // Adds and subtracts signum(a) * 2^52 to force rounding to within precision. +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. const Packet2d offset = pselect(pcmp_lt(a, pzero(a)), pset1<Packet2d>(-static_cast<double>(1ull<<52)), @@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { return psub(padd(a, offset), offset); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) +{ + const Packet4f cst_1 = pset1<Packet4f>(1.0f); + Packet4f tmp = print<Packet4f>(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { const Packet2d cst_1 = pset1<Packet2d>(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print<Packet2d>(a); // If greater, subtract one. Packet2d mask = _mm_cmpgt_pd(tmp, a); mask = pand(mask, cst_1); @@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { const Packet4f cst_1 = pset1<Packet4f>(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); + Packet4f tmp = print<Packet4f>(a); // If smaller, add one. Packet4f mask = _mm_cmplt_ps(tmp, a); mask = pand(mask, cst_1); - tmp = padd(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + return padd(tmp, mask); } template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { const Packet2d cst_1 = pset1<Packet2d>(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print<Packet2d>(a); // If smaller, add one. Packet2d mask = _mm_cmplt_pd(tmp, a); mask = pand(mask, cst_1); |