about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rw-r--r-- Eigen/src/Core/arch/SSE/CMakeLists.txt 6
-rw-r--r-- Eigen/src/Core/arch/SSE/MathFunctions.h 46
-rwxr-xr-x Eigen/src/Core/arch/SSE/PacketMath.h 39
3 files changed, 17 insertions, 74 deletions
diff --git a/Eigen/src/Core/arch/SSE/CMakeLists.txt b/Eigen/src/Core/arch/SSE/CMakeLists.txt
deleted file mode 100644
index 46ea7cc62..000000000
--- a/Eigen/src/Core/arch/SSE/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Core_arch_SSE_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 28f103eeb..ac2fd8103 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -517,52 +517,10 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
}
// Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
- // Clamp the inputs to the range [-9, 9] since anything outside
- // this range is +/-1.0f in single-precision.
- _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
- const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
-
- // The monomial coefficients of the numerator polynomial (odd).
- _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
- _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
-
- // The monomial coefficients of the denominator polynomial (even).
- _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
- _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
- _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
- _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
-
- // Since the polynomials are odd/even, we need x^2.
- const Packet4f x2 = pmul(x, x);
-
- // Evaluate the numerator polynomial p.
- Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
- p = pmadd(x2, p, p4f_alpha_9);
- p = pmadd(x2, p, p4f_alpha_7);
- p = pmadd(x2, p, p4f_alpha_5);
- p = pmadd(x2, p, p4f_alpha_3);
- p = pmadd(x2, p, p4f_alpha_1);
- p = pmul(x, p);
-
- // Evaluate the denominator polynomial p.
- Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
- q = pmadd(x2, q, p4f_beta_2);
- q = pmadd(x2, q, p4f_beta_0);
-
- // Divide the numerator by the denominator.
- return pdiv(p, q);
+ptanh<Packet4f>(const Packet4f& x) {
+ return internal::generic_fast_tanh_float(x);
}
} // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 451034560..baad692e3 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -162,6 +162,11 @@ template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4,
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+#endif
+
#if EIGEN_COMP_MSVC==1500
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
@@ -434,30 +439,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
- static Packet4f run(const Packet4f& a) {
- return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
- static Packet4i run(const Packet4i& a) {
- return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
- static Packet2d run(const Packet2d& a) {
- return vec2d_swizzle1(a, offset, (offset + 1) % 2);
- }
-};
-
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
@@ -837,6 +818,16 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
#endif
}
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+ return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+ return ::fma(a,b,c);
+}
+#endif
+
} // end namespace internal
} // end namespace Eigen