about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/AVX
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2018-11-26 16:36:19 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2018-11-26 16:36:19 +0100
commit: cf8b85d5c5d1896ce1759a8c18beb56e8a71dea2 (patch)
tree: 4a441562f5ceb7d93b75518564cf42ac7b22a347 /Eigen/src/Core/arch/AVX
parent: c2f35b1b4763348fd0a6df2ce750a7d3d3a56d79 (diff)
Unify SSE and AVX implementation of pexp
Diffstat (limited to 'Eigen/src/Core/arch/AVX')
-rw-r--r-- Eigen/src/Core/arch/AVX/MathFunctions.h | 57
-rw-r--r-- Eigen/src/Core/arch/AVX/PacketMath.h | 16
2 files changed, 17 insertions, 56 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 134facccd..b038c7499 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -111,62 +111,7 @@ plog<Packet8f>(const Packet8f& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
pexp<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
- // Clamp x.
- Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
- // Express exp(x) as exp(m*ln(2) + r), start by extracting
- // m = floor(x/ln(2) + 0.5).
- Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
- _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
- Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
- Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
- r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
- Packet8f r2 = pmul(r, r);
-
- // TODO(gonnet): Split into odd/even polynomials and try to exploit
- // instruction-level parallelism.
- Packet8f y = p8f_cephes_exp_p0;
- y = pmadd(y, r, p8f_cephes_exp_p1);
- y = pmadd(y, r, p8f_cephes_exp_p2);
- y = pmadd(y, r, p8f_cephes_exp_p3);
- y = pmadd(y, r, p8f_cephes_exp_p4);
- y = pmadd(y, r, p8f_cephes_exp_p5);
- y = pmadd(y, r2, r);
- y = padd(y, p8f_1);
-
- // Build emm0 = 2^m.
- Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
- emm0 = pshiftleft(emm0, 23);
-
- // Return 2^m * exp(r).
- return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+ return pexp_float(_x);
}
// Hyperbolic Tangent function.
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 4b5bfebdf..770646b91 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -389,6 +389,22 @@ template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Pack
return pfrexp_float(a,exponent);
}
+template<> EIGEN_STRONG_INLINE Packet8f pcast_and_shiftleft<Packet8f>(Packet8f v, int n)
+{
+ Packet8i vi = _mm256_cvttps_epi32(v);
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_castsi256_ps(_mm256_slli_epi32(vi, n));
+#else
+ __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(vi, 0), n);
+ __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(vi, 1), n);
+ return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
// preduxp should be ok
// FIXME: why is this ok? why isn't the simply implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)