aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2018-11-27 22:41:51 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2018-11-27 22:41:51 +0100
commitfa7fd61edad765608beb629a2c6f656535188db6 (patch)
tree90e7a48a96e0f8663dc9c0bc1cc518082b664dcb /Eigen/src/Core/arch/SSE
parent08edbc8cfebbd4064ca625072b128408b9bbe812 (diff)
Unify SSE/AVX psin functions.
It is based on the SSE version which is much more accurate, though very slightly slower. This changeset also includes the following required changes: - add packet-float to packet-int type traits - add packet float<->int reinterpret casts - add faster pselect for AVX based on blendv
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rw-r--r--Eigen/src/Core/arch/SSE/MathFunctions.h96
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h24
-rw-r--r--Eigen/src/Core/arch/SSE/TypeCasting.h7
3 files changed, 29 insertions, 98 deletions
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index e2046be47..9e699244e 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -54,101 +54,7 @@ Packet2d pexp<Packet2d>(const Packet2d& x)
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psin<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
- Packet4i emm0, emm2;
- sign_bit = x;
- /* take the absolute value */
- x = pabs(x);
-
- /* take the modulo */
-
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* store the integer part of y in mm0 */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
- /* get the swap sign flag */
- emm0 = _mm_and_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask
- there is one polynom for 0 <= x <= Pi/4
- and another one for Pi/4<x<=Pi/2
-
- Both branches will be computed.
- */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = _mm_mul_ps(x,x);
-
- y = pmadd(y, z, p4f_coscof_p1);
- y = pmadd(y, z, p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = pmul(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmul(y2, x);
- y2 = padd(y2, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
+ return psin_float(_x);
}
/* almost the same as psin */
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index cb0f9f2c5..a508ce73e 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -158,9 +158,22 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16};
+};
+template<> struct unpacket_traits<Packet2d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {size=2, alignment=Aligned16};
+};
+template<> struct unpacket_traits<Packet4i> {
+ typedef int type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16};
+};
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -184,6 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int fro
template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
@@ -338,6 +352,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
@@ -354,6 +370,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pshiftleft<Packet4i>(const Packet4i& a, int n) { return _mm_slli_epi32(a,n); }
+
#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c6ca8c716..f607366f0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
return _mm_cvtps_pd(a);
}
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return _mm_castsi128_ps(a);
+}
} // end namespace internal