about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
author Antonio Sanchez <cantonios@google.com> 2021-02-25 14:29:49 -0800
committer Antonio Sanchez <cantonios@google.com> 2021-02-25 14:39:26 -0800
commit 5529db7524b93208f3d97f5fadc53aff1de70190 (patch)
tree 776d264bc8af0004bcd5eb6468ddb5c2bb4ea299 /Eigen/src/Core/arch/SSE
parent ecb7b19dfa6c4bbf7a4068e114a1c86aa88908fe (diff)
Fix SSE/NEON pfloor/pceil for saturated values.
The original will saturate if the input does not fit into an integer type. Here we fix this, returning the input if it doesn't have enough precision to have a fractional part. Also added `pceil` for NEON. Fixes #1969.
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-x Eigen/src/Core/arch/SSE/PacketMath.h 78
1 files changed, 44 insertions, 34 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 23579a978..78fd99e64 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -602,6 +602,26 @@ template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packe
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)  // Lane-wise absolute value of a packet of 32-bit floats.
+{
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));  // Every bit set except bit 31, the IEEE-754 sign bit, in each 32-bit lane.
+ return _mm_and_ps(a,mask);  // The AND clears the sign bit and leaves exponent and mantissa bits untouched.
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)  // Lane-wise absolute value of a packet of 64-bit doubles.
+{
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));  // setr lists 32-bit words lowest first, so each 64-bit lane becomes 0x7FFFFFFFFFFFFFFF (all bits except the sign bit 63).
+ return _mm_and_pd(a,mask);  // The AND clears the sign bit and leaves exponent and mantissa bits untouched.
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)  // Lane-wise absolute value of four 32-bit signed integers.
+{
+ #ifdef EIGEN_VECTORIZE_SSSE3
+ return _mm_abs_epi32(a);  // Dedicated intrinsic, used when EIGEN_VECTORIZE_SSSE3 is defined.
+ #else
+ Packet4i aux = _mm_srai_epi32(a,31);  // Arithmetic shift replicates the sign bit: 0 for non-negative lanes, -1 (all ones) for negative lanes.
+ return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);  // (a ^ aux) - aux negates negative lanes in two's complement and leaves the others unchanged; INT32_MIN wraps back to itself.
+ #endif
+}
+
#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
@@ -632,20 +652,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
Packet4i emm0 = _mm_cvttps_epi32(a);
Packet4f tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, substract 1 */
+ // If greater, subtract one.
Packet4f mask = _mm_cmpgt_ps(tmp, a);
mask = pand(mask, cst_1);
- return psub(tmp, mask);
+ tmp = psub(tmp, mask);
+ // Handle saturation cases.
+ const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+ return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+// Rounds each lane of a Packet2d to an integral value using only floating-point add/subtract, with no conversion to an integer type.
+EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {  // Called by the Packet2d pfloor and pceil specializations in this file.
+ // Adds and subtracts signum(a) * 2^52: doubles of magnitude >= 2^52 carry no fractional bits, so the addition rounds the fraction away (per the active rounding mode) and the subtraction restores the scale.
+ const Packet2d offset =
+ pselect(pcmp_lt(a, pzero(a)),  // Lanes where a < 0 holds take the first value below; all other lanes (including -0.0 and NaN, which do not compare less than zero) take the second. Assumes pselect(mask, x, y) yields x where the mask is set.
+ pset1<Packet2d>(-static_cast<double>(1ull<<52)),  // -2^52, for negative lanes.
+ pset1<Packet2d>(+static_cast<double>(1ull<<52)));  // +2^52, for the remaining lanes.
+ return psub(padd(a, offset), offset);  // NOTE(review): for |a| >= 2^52 the sum may not be representable (e.g. a = 2^52 + 1 gives 2^53 + 1), so the result can differ from a; also relies on the compiler not folding (a + offset) - offset. Confirm callers tolerate both.
 }
-// WARNING: this pfloor implementation makes sense for small inputs only,
-// It is currently only used by pexp and not exposed through HasFloor.
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
{
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet4i emm0 = _mm_cvttpd_epi32(a);
- Packet2d tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
+ Packet2d tmp = pround_to_nearest(a);
+ // If greater, subtract one.
Packet2d mask = _mm_cmpgt_pd(tmp, a);
mask = pand(mask, cst_1);
return psub(tmp, mask);
@@ -656,20 +686,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
Packet4i emm0 = _mm_cvttps_epi32(a);
Packet4f tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, substract 1 */
+ // If smaller, add one.
Packet4f mask = _mm_cmplt_ps(tmp, a);
mask = pand(mask, cst_1);
- return padd(tmp, mask);
+ tmp = padd(tmp, mask);
+ // Handle saturation cases.
+ const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+ return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
}
-// WARNING: this pfloor implementation makes sense for small inputs only,
-// It is currently only used by pexp and not exposed through HasFloor.
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
{
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet4i emm0 = _mm_cvttpd_epi32(a);
- Packet2d tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
+ Packet2d tmp = pround_to_nearest(a);
+ // If smaller, add one.
Packet2d mask = _mm_cmplt_pd(tmp, a);
mask = pand(mask, cst_1);
return padd(tmp, mask);
@@ -866,26 +896,6 @@ template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
- return _mm_and_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
- return _mm_and_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
- #ifdef EIGEN_VECTORIZE_SSSE3
- return _mm_abs_epi32(a);
- #else
- Packet4i aux = _mm_srai_epi32(a,31);
- return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
- #endif
-}
-
template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
return pfrexp_generic(a,exponent);
}