From cdd8fdc32e730d5a65796a791ff13a92815c59b9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 18 Jan 2021 13:25:16 +0000 Subject: Vectorize `pow(x, y)`. This closes https://gitlab.com/libeigen/eigen/-/issues/2085, which also contains a description of the algorithm. I ran some testing (comparing to `std::pow(double(x), double(y)))` for `x` in the set of all (positive) floats in the interval `[std::sqrt(std::numeric_limits::min()), std::sqrt(std::numeric_limits::max())]`, and `y` in `{2, sqrt(2), -sqrt(2)}` I get the following error statistics: ``` max_rel_error = 8.34405e-07 rms_rel_error = 2.76654e-07 ``` If I widen the range to all normal float I see lower accuracy for arguments where the result is subnormal, e.g. for `y = sqrt(2)`: ``` max_rel_error = 0.666667 rms = 6.8727e-05 count = 1335165689 argmax = 2.56049e-32, 2.10195e-45 != 1.4013e-45 ``` which seems reasonable, since these results are subnormals with only couple of significant bits left. --- Eigen/src/Core/arch/SSE/PacketMath.h | 38 +++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'Eigen/src/Core/arch/SSE') diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4c5b664e6..05d9f8edd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -26,7 +26,7 @@ namespace internal { #ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -147,13 +147,13 @@ struct packet_traits : default_packet_traits { HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasBlend = 1, + HasCeil = 1, HasFloor = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRint = 1, - HasRound = 1, - HasCeil = 1 + HasRound = 1 #endif }; }; @@ -173,14 +173,14 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 + HasBlend = 1, + HasFloor = 1, + HasCeil = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRound = 1, - HasRint = 1, - HasFloor = 1, - HasCeil = 1 + HasRint = 1 #endif }; }; @@ -650,6 +650,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) mask = pand(mask, cst_1); return psub(tmp, mask); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4i emm0 = _mm_cvttps_epi32(a); + Packet4f tmp = _mm_cvtepi32_ps(emm0); + /* if greater, substract 1 */ + Packet4f mask = _mm_cmplt_ps(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +// WARNING: this pfloor implementation makes sense for small inputs only, +// It is currently only used by pexp and not exposed through HasFloor. +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet4i emm0 = _mm_cvttpd_epi32(a); + Packet2d tmp = _mm_cvtepi32_pd(emm0); + /* if greater, substract 1 */ + Packet2d mask = _mm_cmplt_pd(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} #endif template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -- cgit v1.2.3