diff options
author | Gael Guennebaud <g.gael@free.fr> | 2018-11-26 16:36:19 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2018-11-26 16:36:19 +0100 |
commit | cf8b85d5c5d1896ce1759a8c18beb56e8a71dea2 (patch) | |
tree | 4a441562f5ceb7d93b75518564cf42ac7b22a347 /Eigen/src/Core/arch/AVX | |
parent | c2f35b1b4763348fd0a6df2ce750a7d3d3a56d79 (diff) |
Unify SSE and AVX implementation of pexp
Diffstat (limited to 'Eigen/src/Core/arch/AVX')
-rw-r--r-- | Eigen/src/Core/arch/AVX/MathFunctions.h | 57 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 16 |
2 files changed, 17 insertions, 56 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 134facccd..b038c7499 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -111,62 +111,7 @@ plog<Packet8f>(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp<Packet8f>(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); + return pexp_float(_x); } // Hyperbolic Tangent function. diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 4b5bfebdf..770646b91 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -389,6 +389,22 @@ template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Pack return pfrexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet8f pcast_and_shiftleft<Packet8f>(Packet8f v, int n) +{ + Packet8i vi = _mm256_cvttps_epi32(v); +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_castsi256_ps(_mm256_slli_epi32(vi, n)); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(vi, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(vi, 1), n); + return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) { + return pldexp_float(a,exponent); +} + // preduxp should be ok // FIXME: why is this ok? why isn't the simply implementation working as expected? template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs) |