From 1e0c7d4f4933b12a325dbaa2c79ce946bb13f7d6 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 25 Feb 2021 14:29:49 -0800 Subject: Add print for SSE/NEON, use NEON rounding intrinsics if available. In SSE, by adding/subtracting 2^MantissaBits, we force rounding according to the current rounding mode. For NEON, we use the provided intrinsics for rint/floor/ceil if available (armv8). Related to #1969. --- Eigen/src/Core/arch/NEON/PacketMath.h | 217 ++++++++++++++++------------------ Eigen/src/Core/arch/SSE/PacketMath.h | 61 +++++----- Eigen/src/Core/util/Macros.h | 8 ++ test/packetmath.cpp | 3 + 4 files changed, 140 insertions(+), 149 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f77a18a4f..ec6ea90c5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -196,6 +196,7 @@ struct packet_traits : default_packet_traits HasDiv = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -3182,63 +3183,88 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) { return vbslq_u64(mask, a, b); } +// Use armv8 rounding intinsics if available. +#if EIGEN_ARCH_ARMV8 +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) +{ return vrndn_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ return vrndnq_f32(a); } template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) -{ - const Packet2f cst_1 = pset1(1.0f); - // Round to nearest. - Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a)); - // If greater, subtract one. - Packet2ui mask = vcgt_f32(tmp, a); - mask = vand_u32(mask, vreinterpret_u32_f32(cst_1)); - tmp = vsub_f32(tmp, vreinterpret_f32_u32(mask)); - // Handle saturation cases. - const Packet2f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); +{ return vrndm_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ return vrndmq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ return vrndp_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ return vrndpq_f32(a); } + +#else + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f offset = + pselect(pcmp_lt(a, pzero(a)), + pset1(-static_cast(1<<23)), + pset1(+static_cast(1<<23))); + return psub(padd(a, offset), offset); +} + +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet2f offset = + pselect(pcmp_lt(a, pzero(a)), + pset1(-static_cast(1<<23)), + pset1(+static_cast(1<<23))); + return psub(padd(a, offset), offset); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { const Packet4f cst_1 = pset1(1.0f); - // Round to nearest. - Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a)); + Packet4f tmp = print(a); // If greater, subtract one. - Packet4ui mask = vcgtq_f32(tmp, a); - mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1)); - tmp = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - // Handle saturation cases. - const Packet4f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + Packet4f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); } -template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) { const Packet2f cst_1 = pset1(1.0f); - // Round to nearest. - Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a)); - // If smaller, add one. - Packet2ui mask = vclt_f32(tmp, a); - mask = vand_u32(mask, vreinterpret_u32_f32(cst_1)); - tmp = vadd_f32(tmp, vreinterpret_f32_u32(mask)); - // Handle saturation cases. - const Packet2f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + Packet2f tmp = print(a); + // If greater, subtract one. + Packet2f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { const Packet4f cst_1 = pset1(1.0f); - // Round to nearest. - Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a)); + Packet4f tmp = print(a); + // If smaller, add one. + Packet4f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0); + Packet2f tmp = print(a); // If smaller, add one. - Packet4ui mask = vcltq_f32(tmp, a); - mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1)); - tmp = vaddq_f32(tmp, vreinterpretq_f32_u32(mask)); - // Handle saturation cases. - const Packet4f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + Packet2f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); } +#endif + /** * Computes the integer square root * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result @@ -3404,6 +3430,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -3565,6 +3592,11 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4 return pselect(mask, a, b); } +template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) +{ + return F32ToBf16(print(Bf16ToF32(a))); +} + template<> EIGEN_STRONG_INLINE Packet4bf pfloor(const Packet4bf& a) { return F32ToBf16(pfloor(Bf16ToF32(a))); @@ -3751,6 +3783,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasSin = 0, HasCos = 0, @@ -3932,33 +3965,14 @@ ptranspose(PacketBlock& kernel) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ return vrndnq_f64(a); } + template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) -{ - const Packet2d cst_1 = pset1(1.0); - // Round to nearest. - Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a)); - // If greater, substract 1. - uint64x2_t mask = vcgtq_f64(tmp, a); - mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1)); - tmp = vsubq_f64(tmp, vreinterpretq_f64_u64(mask)); - // Handle saturation cases. - const Packet2d cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} +{ return vrndmq_f64(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) -{ - const Packet2d cst_1 = pset1(1.0); - // Round to nearest. - Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a)); - // If smaller, add one. - uint64x2_t mask = vcltq_f64(tmp, a); - mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1)); - tmp = vaddq_f64(tmp, vreinterpretq_f64_u64(mask)); - // Handle saturation cases. - const Packet2d cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} +{ return vrndpq_f64(a); } template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { return pldexp_generic(a, exponent); } @@ -4020,6 +4034,7 @@ struct packet_traits : default_packet_traits { HasDiv = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasSin = 0, HasCos = 0, HasLog = 0, @@ -4230,6 +4245,30 @@ EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan(const Packet4hf& a, cons return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b))); } +template <> +EIGEN_STRONG_INLINE Packet8hf print(const Packet8hf& a) +{ return vrndnq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf print(const Packet4hf& a) +{ return vrndn_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pfloor(const Packet8hf& a) +{ return vrndmq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf pfloor(const Packet4hf& a) +{ return vrndm_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pceil(const Packet8hf& a) +{ return vrndpq_f16(a); } + +template <> +EIGEN_STRONG_INLINE Packet4hf pceil(const Packet4hf& a) +{ return vrndp_f16(a); } + template <> EIGEN_STRONG_INLINE Packet8hf psqrt(const Packet8hf& a) { return vsqrtq_f16(a); @@ -4464,62 +4503,6 @@ EIGEN_STRONG_INLINE Packet4hf pabs(const Packet4hf& a) { return vabs_f16(a); } -template <> -EIGEN_STRONG_INLINE Packet8hf pfloor(const Packet8hf& a) { - const Packet8hf cst_1 = pset1(Eigen::half(1.0f)); - // Round to nearest. - Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a)); - // If greater, substract one. - uint16x8_t mask = vcgtq_f16(tmp, a); - mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1)); - tmp = vsubq_f16(tmp, vreinterpretq_f16_u16(mask)); - // Handle saturation cases. - const Packet8hf cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} - -template <> -EIGEN_STRONG_INLINE Packet4hf pfloor(const Packet4hf& a) { - const Packet4hf cst_1 = pset1(Eigen::half(1.0f)); - // Round to nearest. - Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a)); - // If greater, substract one. - uint16x4_t mask = vcgt_f16(tmp, a); - mask = vand_u16(mask, vreinterpret_u16_f16(cst_1)); - tmp = vsub_f16(tmp, vreinterpret_f16_u16(mask)); - // Handle saturation cases. - const Packet4hf cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} - -template <> -EIGEN_STRONG_INLINE Packet8hf pceil(const Packet8hf& a) { - const Packet8hf cst_1 = pset1(Eigen::half(1.0f)); - // Round to nearest. - Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a)); - // If smaller, add one. - uint16x8_t mask = vcltq_f16(tmp, a); - mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1)); - tmp = vaddq_f16(tmp, vreinterpretq_f16_u16(mask)); - // Handle saturation cases. - const Packet8hf cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} - -template <> -EIGEN_STRONG_INLINE Packet4hf pceil(const Packet4hf& a) { - const Packet4hf cst_1 = pset1(Eigen::half(1.0f)); - // Round to nearest. - Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a)); - // If smaller, add one. - uint16x4_t mask = vclt_f16(tmp, a); - mask = vand_u16(mask, vreinterpret_u16_f16(cst_1)); - tmp = vadd_f16(tmp, vreinterpret_f16_u16(mask)); - // Handle saturation cases. - const Packet4hf cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); -} - template <> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8hf& a) { float16x4_t a_lo, a_hi, sum; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 78fd99e64..b9821ad80 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -148,12 +148,11 @@ struct packet_traits : default_packet_traits { HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasCeil = 1, - HasFloor = 1 + HasFloor = 1, + HasRint = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRint = 1, - HasRound = 1 + HasRound = 1, #endif }; }; @@ -175,12 +174,10 @@ struct packet_traits : default_packet_traits { HasRsqrt = 1, HasBlend = 1, HasFloor = 1, - HasCeil = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , + HasCeil = 1, + HasRint = 1, + #ifdef EIGEN_VECTORIZE_SSE4_1 HasRound = 1, - HasRint = 1 #endif }; }; @@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { ret template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } #else -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - const Packet4f cst_1 = pset1(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); - // If greater, subtract one. - Packet4f mask = _mm_cmpgt_ps(tmp, a); - mask = pand(mask, cst_1); - tmp = psub(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f offset = + pselect(pcmp_lt(a, pzero(a)), + pset1(-static_cast(1<<23)), + pset1(+static_cast(1<<23))); + return psub(padd(a, offset), offset); } -// Rounds to nearest integer. -EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { - // Adds and subtracts signum(a) * 2^52 to force rounding to within precision. +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. const Packet2d offset = pselect(pcmp_lt(a, pzero(a)), pset1(-static_cast(1ull<<52)), @@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { return psub(padd(a, offset), offset); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { const Packet2d cst_1 = pset1(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print(a); // If greater, subtract one. Packet2d mask = _mm_cmpgt_pd(tmp, a); mask = pand(mask, cst_1); @@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { const Packet4f cst_1 = pset1(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); + Packet4f tmp = print(a); // If smaller, add one. Packet4f mask = _mm_cmplt_ps(tmp, a); mask = pand(mask, cst_1); - tmp = padd(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1(static_cast(NumTraits::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + return padd(tmp, mask); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { const Packet2d cst_1 = pset1(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print(a); // If smaller, add one. Packet2d mask = _mm_cmplt_pd(tmp, a); mask = pand(mask, cst_1); diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b39c6ca43..ac514cbb4 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -265,6 +265,14 @@ #define EIGEN_ARCH_ARM_OR_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater. +#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8 +#define EIGEN_ARCH_ARMV8 1 +#else +#define EIGEN_ARCH_ARMV8 0 +#endif + + /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE /// compliant Arm fp16 type #if EIGEN_ARCH_ARM64 diff --git a/test/packetmath.cpp b/test/packetmath.cpp index ceafb9002..76ac47554 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -577,6 +577,9 @@ void packetmath_real() { values.push_back(Scalar(-1.5) + val); // Bug 1785. val = val / Scalar(2); } + values.push_back(NumTraits::infinity()); + values.push_back(-NumTraits::infinity()); + values.push_back(NumTraits::quiet_NaN()); for (size_t k=0; k