aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/src/Core/arch/AVX/MathFunctions.h78
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h14
-rw-r--r--Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h69
-rw-r--r--Eigen/src/Core/arch/SSE/MathFunctions.h73
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h55
5 files changed, 126 insertions, 163 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index b038c7499..60d6aad40 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -123,82 +123,8 @@ ptanh<Packet8f>(const Packet8f& x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& _x) {
- Packet4d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
- _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
- _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
- Packet4d tmp, fx;
-
- // clamp x
- x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
- // Express exp(x) as exp(g + n*log(2)).
- fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
- // Get the integer modulus of log(2), i.e. the "n" described above.
- fx = _mm256_floor_pd(fx);
-
- // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
- // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
- // digits right.
- tmp = pmul(fx, p4d_cephes_exp_C1);
- Packet4d z = pmul(fx, p4d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet4d x2 = pmul(x, x);
-
- // Evaluate the numerator polynomial of the rational interpolant.
- Packet4d px = p4d_cephes_exp_p0;
- px = pmadd(px, x2, p4d_cephes_exp_p1);
- px = pmadd(px, x2, p4d_cephes_exp_p2);
- px = pmul(px, x);
-
- // Evaluate the denominator polynomial of the rational interpolant.
- Packet4d qx = p4d_cephes_exp_q0;
- qx = pmadd(qx, x2, p4d_cephes_exp_q1);
- qx = pmadd(qx, x2, p4d_cephes_exp_q2);
- qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
- // I don't really get this bit, copied from the SSE2 routines, so...
- // TODO(gonnet): Figure out what is going on here, perhaps find a better
- // rational interpolant?
- x = _mm256_div_pd(px, psub(qx, px));
- x = pmadd(p4d_2, x, p4d_1);
-
- // Build e=2^n by constructing the exponents in a 128-bit vector and
- // shifting them to where they belong in double-precision values.
- __m128i emm0 = _mm256_cvtpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i lo = _mm_slli_epi64(emm0, 52);
- __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
- __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
- e = _mm256_insertf128_si256(e, hi, 1);
-
- // Construct the result 2^n * exp(g) = e * x. The max is used to catch
- // non-finite values in the input.
- return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+pexp<Packet4d>(const Packet4d& x) {
+ return pexp_double(x);
}
// Functions for sqrt.
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 770646b91..a415f2f1b 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -405,6 +405,20 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, cons
return pldexp_float(a,exponent);
}
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+ // Build e=2^n by constructing the exponents in a 128-bit vector and
+ // shifting them to where they belong in double-precision values.
+ Packet4i cst_1023 = pset1<Packet4i>(1023);
+ __m128i emm0 = _mm256_cvtpd_epi32(exponent);
+ emm0 = _mm_add_epi32(emm0, cst_1023);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i lo = _mm_slli_epi64(emm0, 52);
+ __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+ __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+ e = _mm256_insertf128_si256(e, hi, 1);
+ return pmul(a,_mm256_castsi256_pd(e));
+}
+
// preduxp should be ok
// FIXME: why is this ok? why isn't the simply implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 63e21fe42..e05e67703 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -160,5 +160,74 @@ Packet pexp_float(const Packet _x)
return pmax(pldexp(y,m), _x);
}
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x)
+{
+ Packet x = _x;
+
+ const Packet cst_1 = pset1<Packet>(1.0);
+ const Packet cst_2 = pset1<Packet>(2.0);
+ const Packet cst_half = pset1<Packet>(0.5);
+
+ const Packet cst_exp_hi = pset1<Packet>(709.437);
+ const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
+
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+ const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+ const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+ const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+ const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+ Packet tmp, fx;
+
+ // clamp x
+ x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
+ // Express exp(x) as exp(g + n*log(2)).
+ fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+ // Get the integer modulus of log(2), i.e. the "n" described above.
+ fx = pfloor(fx);
+
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+ // digits right.
+ tmp = pmul(fx, cst_cephes_exp_C1);
+ Packet z = pmul(fx, cst_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ Packet x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial of the rational interpolant.
+ Packet px = cst_cephes_exp_p0;
+ px = pmadd(px, x2, cst_cephes_exp_p1);
+ px = pmadd(px, x2, cst_cephes_exp_p2);
+ px = pmul(px, x);
+
+ // Evaluate the denominator polynomial of the rational interpolant.
+ Packet qx = cst_cephes_exp_q0;
+ qx = pmadd(qx, x2, cst_cephes_exp_q1);
+ qx = pmadd(qx, x2, cst_cephes_exp_q2);
+ qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+ // I don't really get this bit, copied from the SSE2 routines, so...
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
+ // rational interpolant?
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(cst_2, x, cst_1);
+
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+ // non-finite values in the input.
+ //return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+ return pmax(pldexp(x,fx), _x);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index acabc9f1d..c3159de09 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -32,78 +32,11 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
{
return pexp_float(_x);
}
+
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet2d pexp<Packet2d>(const Packet2d& x)
{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
- static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
- Packet2d tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_pd(fx);
-#else
- emm0 = _mm_cvttpd_epi32(fx);
- tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
- Packet2d mask = _mm_cmpgt_pd(tmp, fx);
- mask = _mm_and_pd(mask, p2d_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = _mm_cvttpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023_0);
- emm0 = _mm_slli_epi32(emm0, 20);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+ return pexp_double(x);
}
/* evaluation of 4 sines at once, using SSE2 intrinsics.
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 1258f349f..cb0f9f2c5 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -338,6 +338,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
@@ -356,26 +371,23 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
Packet4f tmp = _mm_cvtepi32_ps(emm0);
/* if greater, substract 1 */
Packet4f mask = _mm_cmpgt_ps(tmp, a);
- mask = _mm_and_ps(mask, cst_1);
+ mask = pand(mask, cst_1);
return psub(tmp, mask);
}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+// WARNING: this pfloor implementation makes sense for small inputs only,
+// It is currently only used by pexp and not exposed through HasFloor.
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+ const Packet2d cst_1 = pset1<Packet2d>(1.0);
+ Packet4i emm0 = _mm_cvttpd_epi32(a);
+ Packet2d tmp = _mm_cvtepi32_pd(emm0);
+ /* if greater, substract 1 */
+ Packet2d mask = _mm_cmpgt_pd(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+#endif
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
@@ -557,6 +569,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, cons
return pldexp_float(a,exponent);
}
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
+ Packet4i emm0 = _mm_cvttpd_epi32(exponent);
+ emm0 = padd(emm0, cst_1023_0);
+ emm0 = _mm_slli_epi32(emm0, 20);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+ return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
+}
+
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void