3 files changed, 216 insertions, 92 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index b2170e9f7..8a2a14f4d 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -194,7 +194,8 @@ struct packet_traits<float> : default_packet_traits
     HasBlend     = 0,
 
     HasDiv   = 1,
-    HasFloor = 0,
+    HasFloor = 1,
+    HasCeil = 1,
 
     HasSin  = EIGEN_FAST_MATH,
     HasCos  = EIGEN_FAST_MATH,
@@ -1462,32 +1463,6 @@ template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f&
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
 { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
 
-// WARNING: this pfloor implementation makes sense for inputs that fit in
-// signed int32 integers (up to ~2.14e9), hence this is currently only used
-// by pexp and not exposed through HasFloor.
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{
-  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
-  /* perform a floorf */
-  Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
-
-  /* if greater, substract 1 */
-  Packet2ui mask = vcgt_f32(tmp, a);
-  mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
-  return vsub_f32(tmp, vreinterpret_f32_u32(mask));
-}
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  /* perform a floorf */
-  Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
-
-  /* if greater, substract 1 */
-  Packet4ui mask = vcgtq_f32(tmp, a);
-  mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
-  return vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-}
-
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
 { return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
@@ -3206,6 +3181,63 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
 { return vbslq_u64(mask, a, b); }
 
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
+  // Round to nearest.
+  Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
+  // If greater, subtract one.
+  Packet2ui mask = vcgt_f32(tmp, a);
+  mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
+  tmp = vsub_f32(tmp, vreinterpret_f32_u32(mask));
+  // Handle saturation cases.
+  const Packet2f cst_max = pset1<Packet2f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  // Round to nearest.
+  Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
+  // If greater, subtract one.
+  Packet4ui mask = vcgtq_f32(tmp, a);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
+  tmp = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+  // Handle saturation cases.
+  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
+  // Round to nearest.
+  Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
+  // If smaller, add one.
+  Packet2ui mask = vclt_f32(tmp, a);
+  mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
+  tmp = vadd_f32(tmp, vreinterpret_f32_u32(mask));
+  // Handle saturation cases.
+  const Packet2f cst_max = pset1<Packet2f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  // Round to nearest.
+  Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
+  // If smaller, add one.
+  Packet4ui mask = vcltq_f32(tmp, a);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
+  tmp = vaddq_f32(tmp, vreinterpretq_f32_u32(mask));
+  // Handle saturation cases.
+  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
 /**
  * Computes the integer square root
  * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
@@ -3336,6 +3368,7 @@ template<> struct packet_traits<bfloat16> : default_packet_traits
     HasBlend     = 0,
     HasDiv       = 1,
     HasFloor     = 1,
+    HasCeil      = 1,
 
     HasSin  = EIGEN_FAST_MATH,
     HasCos  = EIGEN_FAST_MATH,
@@ -3502,6 +3535,11 @@ template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
   return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
@@ -3676,7 +3714,8 @@ template<> struct packet_traits<double>  : default_packet_traits
     HasBlend     = 0,
 
     HasDiv   = 1,
-    HasFloor = 0,
+    HasFloor = 1,
+    HasCeil = 1,
 
     HasSin  = 0,
     HasCos  = 0,
@@ -3754,21 +3793,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
 
-// WARNING: this pfloor implementation makes sense for inputs that fit in
-// signed int64 integers (up to ~9.22e18), hence this is currently only used
-// by pexp and not exposed through HasFloor.
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{
-  const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  /* perform a floorf */
-  const Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
-
-  /* if greater, substract 1 */
-  uint64x2_t mask = vcgtq_f64(tmp, a);
-  mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
-  return vsubq_f64(tmp, vreinterpretq_f64_u64(mask));
-}
-
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
 { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
@@ -3872,6 +3896,34 @@ ptranspose(PacketBlock<Packet2d, 2>& kernel)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
 { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  // Round to nearest.
+  Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
+  // If greater, substract 1.
+  uint64x2_t mask = vcgtq_f64(tmp, a);
+  mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
+  tmp = vsubq_f64(tmp, vreinterpretq_f64_u64(mask));
+  // Handle saturation cases.
+  const Packet2d cst_max = pset1<Packet2d>(static_cast<double>(NumTraits<int64_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  // Round to nearest.
+  Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
+  // If smaller, add one.
+  uint64x2_t mask = vcltq_f64(tmp, a);
+  mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
+  tmp = vaddq_f64(tmp, vreinterpretq_f64_u64(mask));
+  // Handle saturation cases.
+  const Packet2d cst_max = pset1<Packet2d>(static_cast<double>(NumTraits<int64_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
 { return pldexp_generic(a, exponent); }
 
@@ -3920,6 +3972,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasReduxp = 1,
     HasDiv = 1,
     HasFloor = 1,
+    HasCeil = 1,
     HasSin = 0,
     HasCos = 0,
     HasLog = 0,
@@ -4132,25 +4185,57 @@ EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, cons
 template <>
 EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
   const Packet8hf cst_1 = pset1<Packet8hf>(Eigen::half(1.0f));
-  /* perform a floorf */
+  // Round to nearest.
   Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a));
-
-  /* if greater, substract 1 */
+  // If greater, substract one.
   uint16x8_t mask = vcgtq_f16(tmp, a);
   mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1));
-  return vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
+  tmp = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
+  // Handle saturation cases.
+  EIGEN_CONSTEXPR Packet8hf cst_max = pset1<Packet8hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
   const Packet4hf cst_1 = pset1<Packet4hf>(Eigen::half(1.0f));
-  /* perform a floorf */
+  // Round to nearest.
   Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a));
-
-  /* if greater, substract 1 */
+  // If greater, substract one.
   uint16x4_t mask = vcgt_f16(tmp, a);
   mask = vand_u16(mask, vreinterpret_u16_f16(cst_1));
-  return vsub_f16(tmp, vreinterpret_f16_u16(mask));
+  tmp = vsub_f16(tmp, vreinterpret_f16_u16(mask));
+  // Handle saturation cases.
+  EIGEN_CONSTEXPR Packet4hf cst_max = pset1<Packet4hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
+  const Packet8hf cst_1 = pset1<Packet8hf>(Eigen::half(1.0f));
+  // Round to nearest.
+  Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a));
+  // If smaller, add one.
+  uint16x8_t mask = vcltq_f16(tmp, a);
+  mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1));
+  tmp = vaddq_f16(tmp, vreinterpretq_f16_u16(mask));
+  // Handle saturation cases.
+  EIGEN_CONSTEXPR Packet8hf cst_max = pset1<Packet8hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
+  const Packet4hf cst_1 = pset1<Packet4hf>(Eigen::half(1.0f));
+  // Round to nearest.
+  Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a));
+  // If smaller, add one.
+  uint16x4_t mask = vclt_f16(tmp, a);
+  mask = vand_u16(mask, vreinterpret_u16_f16(cst_1));
+  tmp = vadd_f16(tmp, vreinterpret_f16_u16(mask));
+  // Handle saturation cases.
+  EIGEN_CONSTEXPR Packet4hf cst_max = pset1<Packet4hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 23579a978..78fd99e64 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -602,6 +602,26 @@ template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packe
 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }
 
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
+{
+  #ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+
 #ifdef EIGEN_VECTORIZE_SSE4_1
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
 {
@@ -632,20 +652,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
   Packet4i emm0 = _mm_cvttps_epi32(a);
   Packet4f tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
+  // If greater, subtract one.
   Packet4f mask = _mm_cmpgt_ps(tmp, a);
   mask = pand(mask, cst_1);
-  return psub(tmp, mask);
+  tmp = psub(tmp, mask);
+  // Handle saturation cases.
+  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+}
+
+// Rounds to nearest integer.
+EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding to within precision.
+  const Packet2d offset = 
+    pselect(pcmp_lt(a, pzero(a)), 
+      pset1<Packet2d>(-static_cast<double>(1ull<<52)),
+      pset1<Packet2d>(+static_cast<double>(1ull<<52)));
+  return psub(padd(a, offset), offset);
 }
 
-// WARNING: this pfloor implementation makes sense for small inputs only,
-// It is currently only used by pexp and not exposed through HasFloor.
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
 {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet4i emm0 = _mm_cvttpd_epi32(a);
-  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
+  Packet2d tmp  = pround_to_nearest(a);
+  // If greater, subtract one.
   Packet2d mask = _mm_cmpgt_pd(tmp, a);
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
@@ -656,20 +686,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
   Packet4i emm0 = _mm_cvttps_epi32(a);
   Packet4f tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
+  // If smaller, add one.
   Packet4f mask = _mm_cmplt_ps(tmp, a);
   mask = pand(mask, cst_1);
-  return padd(tmp, mask);
+  tmp = padd(tmp, mask);
+  // Handle saturation cases.
+  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
+  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
 }
 
-// WARNING: this pfloor implementation makes sense for small inputs only,
-// It is currently only used by pexp and not exposed through HasFloor.
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
 {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet4i emm0 = _mm_cvttpd_epi32(a);
-  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
+  Packet2d tmp  = pround_to_nearest(a);
+  // If smaller, add one.
   Packet2d mask = _mm_cmplt_pd(tmp, a);
   mask = pand(mask, cst_1);
   return padd(tmp, mask);
@@ -866,26 +896,6 @@ template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
-  return _mm_and_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
-  return _mm_and_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
-  #ifdef EIGEN_VECTORIZE_SSSE3
-  return _mm_abs_epi32(a);
-  #else
-  Packet4i aux = _mm_srai_epi32(a,31);
-  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
-  #endif
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
   return pfrexp_generic(a,exponent);
 }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 94a83c008..f355af491 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -546,14 +546,43 @@ void packetmath_real() {
   CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
   CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
   CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
-
-  // See bug 1785.
-  for (int i = 0; i < size; ++i) {
-    data1[i] = Scalar(-1.5 + i);
-    data2[i] = Scalar(-1.5 + i);
+  
+  // Rounding edge cases.
+  if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
+    typedef typename internal::make_integer<Scalar>::type IntType;
+    // Start with values that cannot fit inside an integer, work down to less than one.
+    Scalar val = Scalar(2) * static_cast<Scalar>(NumTraits<IntType>::highest());
+    std::vector<Scalar> values;
+    while (val > Scalar(0.25)) {
+      // Cover both even and odd, positive and negative cases.
+      values.push_back(val);
+      values.push_back(val + Scalar(0.3));
+      values.push_back(val + Scalar(0.5));
+      values.push_back(val + Scalar(0.8));
+      values.push_back(val + Scalar(1));
+      values.push_back(val + Scalar(1.3));
+      values.push_back(val + Scalar(1.5));
+      values.push_back(val + Scalar(1.8));
+      values.push_back(-val);
+      values.push_back(-val - Scalar(0.3));
+      values.push_back(-val - Scalar(0.5));
+      values.push_back(-val - Scalar(0.8));
+      values.push_back(-val - Scalar(1));
+      values.push_back(-val - Scalar(1.3));
+      values.push_back(-val - Scalar(1.5));
+      values.push_back(-val - Scalar(1.8));
+      values.push_back(Scalar(-1.5) + val);  // Bug 1785.
+      val = val / Scalar(2);
+    }
+    
+    for (size_t k=0; k<values.size(); ++k) {
+      data1[0] = values[k];
+      CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
+      CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+      CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+      CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
+    }
   }
-  CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-  CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
 
   for (int i = 0; i < size; ++i) {
     data1[i] = Scalar(internal::random<double>(-1, 1));