path: root/Eigen/src/Core/arch
author    Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
committer Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
commit    2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch)
tree      c187e7ec5e90a191e19466ff6084dd8f053dba7e /Eigen/src/Core/arch
parent    e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff)
parent    5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff)
Merged eigen/eigen into default
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h                            28
-rw-r--r--  Eigen/src/Core/arch/AVX/MathFunctions.h                     316
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h                        221
-rw-r--r--  Eigen/src/Core/arch/AVX/TypeCasting.h                        10
-rw-r--r--  Eigen/src/Core/arch/AVX512/Complex.h                        488
-rw-r--r--  Eigen/src/Core/arch/AVX512/MathFunctions.h                   26
-rw-r--r--  Eigen/src/Core/arch/AVX512/PacketMath.h                     309
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h                        16
-rw-r--r--  Eigen/src/Core/arch/AltiVec/MathFunctions.h                 267
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h                    167
-rw-r--r--  Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h    471
-rw-r--r--  Eigen/src/Core/arch/Default/Settings.h                        2
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMath.h                        124
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h                    217
-rw-r--r--  Eigen/src/Core/arch/MSA/Complex.h                             4
-rw-r--r--  Eigen/src/Core/arch/MSA/MathFunctions.h                       4
-rw-r--r--  Eigen/src/Core/arch/MSA/PacketMath.h                          6
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h                           32
-rw-r--r--  Eigen/src/Core/arch/NEON/MathFunctions.h                    170
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h                        69
-rw-r--r--  Eigen/src/Core/arch/NEON/TypeCasting.h                        8
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h                            25
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h                     417
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h                        181
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h                         7
-rw-r--r--  Eigen/src/Core/arch/SYCL/InteropHeaders.h                     2
-rw-r--r--  Eigen/src/Core/arch/ZVector/Complex.h                         4
-rwxr-xr-x  Eigen/src/Core/arch/ZVector/PacketMath.h                      6
28 files changed, 2224 insertions, 1373 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 7fa61969d..5b8ff59bd 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -22,6 +22,7 @@ struct Packet4cf
__m256 v;
};
+#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
@@ -44,8 +45,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -67,10 +69,18 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
return Packet4cf(result);
}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+ __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+ return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
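Note on the new complex pcmp_eq above: the underlying _CMP_EQ_OQ compare produces a per-float mask, so the result is ANDed with a copy of itself whose real/imaginary lanes are swapped (_mm256_permute_ps with control 0xb1); a complex lane is therefore all-ones only when both its real and imaginary parts compare equal, and all-zeros whenever either part is NaN. A scalar sketch of the intended semantics, for illustration only (not part of the patch):

    #include <complex>
    // Scalar model of one complex lane of pcmp_eq(a, b).
    bool complex_lane_eq(std::complex<float> a, std::complex<float> b) {
      bool eq_re = (a.real() == b.real());  // _CMP_EQ_OQ: ordered, so false for NaN
      bool eq_im = (a.imag() == b.imag());
      return eq_re && eq_im;                // the permute+and requires both lanes to match
    }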
@@ -228,6 +238,7 @@ struct Packet2cd
__m256d v;
};
+#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
@@ -250,8 +261,9 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -272,10 +284,18 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
return Packet2cd(_mm256_addsub_pd(even, odd));
}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+ __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+ return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
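The pandnot change above (and the matching Packet4cf one earlier) fixes the operand order: _mm256_andnot_ps/_mm256_andnot_pd compute (~x) & y, i.e. they complement their first argument, whereas Eigen's pandnot(a,b) is meant to return a & ~b. Calling the intrinsic with (b.v, a.v) therefore yields the intended result. A scalar sketch of the two conventions, for illustration only:

    // Hypothetical scalar model: the intrinsic complements its first operand.
    unsigned andnot_intrinsic(unsigned x, unsigned y) { return ~x & y; }
    // Eigen's pandnot(a, b) must return a & ~b, hence the swapped arguments:
    unsigned pandnot_model(unsigned a, unsigned b) { return andnot_intrinsic(b, a); }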
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 6af67ce2d..9f375ed98 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
#define EIGEN_MATH_FUNCTIONS_AVX_H
-/* The sin, cos, exp, and log functions of this file are loosely derived from
+/* The sin and cos functions of this file are loosely derived from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
@@ -18,187 +18,22 @@ namespace Eigen {
namespace internal {
-inline Packet8i pshiftleft(Packet8i v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_slli_epi32(v, n);
-#else
- __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
- __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
-
-inline Packet8f pshiftright(Packet8f v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
-#else
- __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
- __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
- return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
-#endif
-}
-
-// Sine function
-// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
-// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
-// are (anti-)symmetric and thus have only odd/even coefficients
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psin<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
-
- // Some useful values.
- _EIGEN_DECLARE_CONST_Packet8i(one, 1);
- _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
- _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
-
- // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
- Packet8f z = pmul(x, p8f_one_over_pi);
- Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
- x = pmadd(shift, p8f_neg_pi_first, x);
- x = pmadd(shift, p8f_neg_pi_second, x);
- x = pmadd(shift, p8f_neg_pi_third, x);
- z = pmul(x, p8f_four_over_pi);
-
- // Make a mask for the entries that need flipping, i.e. wherever the shift
- // is odd.
- Packet8i shift_ints = _mm256_cvtps_epi32(shift);
- Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
- Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
-
- // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
- // is set to ones for that entry.
- Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
-
- // Evaluate the polynomial for the interval [1,3] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
- Packet8f z_minus_two = psub(z, p8f_two);
- Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
- Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
-
- // Evaluate the polynomial for the interval [-1,1] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
- Packet8f z2 = pmul(z, z);
- Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
- left = pmadd(left, z2, p8f_coeff_left_3);
- left = pmadd(left, z2, p8f_coeff_left_1);
- left = pmul(left, z);
-
- // Assemble the results, i.e. select the left and right polynomials.
- left = _mm256_andnot_ps(ival_mask, left);
- right = _mm256_and_ps(ival_mask, right);
- Packet8f res = _mm256_or_ps(left, right);
+ return psin_float(_x);
+}
- // Flip the sign on the odd intervals and return the result.
- res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
- return res;
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pcos<Packet8f>(const Packet8f& _x) {
+ return pcos_float(_x);
}
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-// polynomial interpolants -> ... -> profit!
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
plog<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- // The smallest non denormalized float number.
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
-
- // Polynomial coefficients.
- _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
-
- Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
- Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
- // Truncate input values to the minimum positive normal.
- x = pmax(x, p8f_min_norm_pos);
-
- Packet8f emm0 = pshiftright(x,23);
- Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
-
- // Set the exponents to -1, i.e. x are in the range [0.5,1).
- x = _mm256_and_ps(x, p8f_inv_mant_mask);
- x = _mm256_or_ps(x, p8f_half);
-
- // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
- // and shift by -1. The values are then centered around 0, which improves
- // the stability of the polynomial evaluation.
- // if( x < SQRTHF ) {
- // e -= 1;
- // x = x + x - 1.0;
- // } else { x = x - 1.0; }
- Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
- Packet8f tmp = _mm256_and_ps(x, mask);
- x = psub(x, p8f_1);
- e = psub(e, _mm256_and_ps(p8f_1, mask));
- x = padd(x, tmp);
-
- Packet8f x2 = pmul(x, x);
- Packet8f x3 = pmul(x2, x);
-
- // Evaluate the polynomial approximant of degree 8 in three parts, probably
- // to improve instruction-level parallelism.
- Packet8f y, y1, y2;
- y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
- y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
- y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
- y = pmadd(y, x, p8f_cephes_log_p2);
- y1 = pmadd(y1, x, p8f_cephes_log_p5);
- y2 = pmadd(y2, x, p8f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- // Add the logarithm of the exponent back to the result of the interpolation.
- y1 = pmul(e, p8f_cephes_log_q1);
- tmp = pmul(x2, p8f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p8f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
-
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
- return _mm256_or_ps(
- _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
- _mm256_and_ps(iszero_mask, p8f_minus_inf));
+ return plog_float(_x);
}
// Exponential function. Works by writing "x = m*log(2) + r" where
@@ -207,62 +42,7 @@ plog<Packet8f>(const Packet8f& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
pexp<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
- // Clamp x.
- Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
- // Express exp(x) as exp(m*ln(2) + r), start by extracting
- // m = floor(x/ln(2) + 0.5).
- Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
- _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
- Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
- Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
- r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
- Packet8f r2 = pmul(r, r);
-
- // TODO(gonnet): Split into odd/even polynomials and try to exploit
- // instruction-level parallelism.
- Packet8f y = p8f_cephes_exp_p0;
- y = pmadd(y, r, p8f_cephes_exp_p1);
- y = pmadd(y, r, p8f_cephes_exp_p2);
- y = pmadd(y, r, p8f_cephes_exp_p3);
- y = pmadd(y, r, p8f_cephes_exp_p4);
- y = pmadd(y, r, p8f_cephes_exp_p5);
- y = pmadd(y, r2, r);
- y = padd(y, p8f_1);
-
- // Build emm0 = 2^m.
- Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
- emm0 = pshiftleft(emm0, 23);
-
- // Return 2^m * exp(r).
- return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+ return pexp_float(_x);
}
// Hyperbolic Tangent function.
@@ -274,82 +54,8 @@ ptanh<Packet8f>(const Packet8f& x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& _x) {
- Packet4d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
- _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
- _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
- Packet4d tmp, fx;
-
- // clamp x
- x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
- // Express exp(x) as exp(g + n*log(2)).
- fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
- // Get the integer modulus of log(2), i.e. the "n" described above.
- fx = _mm256_floor_pd(fx);
-
- // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
- // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
- // digits right.
- tmp = pmul(fx, p4d_cephes_exp_C1);
- Packet4d z = pmul(fx, p4d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet4d x2 = pmul(x, x);
-
- // Evaluate the numerator polynomial of the rational interpolant.
- Packet4d px = p4d_cephes_exp_p0;
- px = pmadd(px, x2, p4d_cephes_exp_p1);
- px = pmadd(px, x2, p4d_cephes_exp_p2);
- px = pmul(px, x);
-
- // Evaluate the denominator polynomial of the rational interpolant.
- Packet4d qx = p4d_cephes_exp_q0;
- qx = pmadd(qx, x2, p4d_cephes_exp_q1);
- qx = pmadd(qx, x2, p4d_cephes_exp_q2);
- qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
- // I don't really get this bit, copied from the SSE2 routines, so...
- // TODO(gonnet): Figure out what is going on here, perhaps find a better
- // rational interpolant?
- x = _mm256_div_pd(px, psub(qx, px));
- x = pmadd(p4d_2, x, p4d_1);
-
- // Build e=2^n by constructing the exponents in a 128-bit vector and
- // shifting them to where they belong in double-precision values.
- __m128i emm0 = _mm256_cvtpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i lo = _mm_slli_epi64(emm0, 52);
- __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
- __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
- e = _mm256_insertf128_si256(e, hi, 1);
-
- // Construct the result 2^n * exp(g) = e * x. The max is used to catch
- // non-finite values in the input.
- return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+pexp<Packet4d>(const Packet4d& x) {
+ return pexp_double(x);
}
// Functions for sqrt.
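With this merge the AVX backend no longer carries its own Cephes-style bodies for psin/pcos/plog/pexp; the specializations above are one-line forwarders to the shared helpers (psin_float, pcos_float, plog_float, pexp_float, pexp_double) added in Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h (see the diffstat). Those generic routines are written in terms of packet primitives such as pmadd, pselect, pcmp_*, pfrexp and pldexp, so a backend that supplies the primitives gets the transcendental functions for free. A hedged sketch of the forwarding pattern for some other packet type P (P is a placeholder, not a type from the patch):

    // Sketch: once the primitives exist for P, each math function is a forwarder.
    template <>
    EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED P
    plog<P>(const P& x) {
      return plog_float(x);  // shared implementation in GenericPacketMathFunctions.h
    }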
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 774e64981..f88e36024 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -18,11 +18,11 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
@@ -63,7 +63,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
- HasCos = 0,
+ HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
@@ -113,14 +113,29 @@ template<> struct packet_traits<int> : default_packet_traits
};
*/
-template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet8i integer_packet;
+ enum {size=8, alignment=Aligned32, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {size=4, alignment=Aligned32, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
@@ -129,6 +144,15 @@ template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { retur
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_add_epi32(a,b);
+#else
+ __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+ __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
@@ -157,13 +181,14 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
return pset1<Packet8i>(0);
}
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
- // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
- // and gcc stupidly generates a vfmadd132ps instruction,
- // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
- // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+ // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+ // and even register spilling with clang>=6.0 (bug 1637).
+ // Gcc stupidly generates a vfmadd132ps instruction.
+ // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+ // case is to accumulate the result of the product.
Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
@@ -172,7 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
@@ -184,21 +209,69 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d&
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // There appears to be a bug in GCC, by which the optimizer may flip
+ // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
+ // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+ // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ Packet8f res;
+ asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::min.
return _mm256_min_ps(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet4d res;
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::min.
return _mm256_min_pd(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet8f res;
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::max.
return _mm256_max_ps(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet4d res;
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::max.
return _mm256_max_pd(b,a);
+#endif
}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
+
+template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_cmpeq_epi32(a,b);
+#else
+ __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+ __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
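The pcmp_* helpers added in this hunk return full-width lane masks (all bits set where the predicate holds, all bits clear otherwise). The _OQ comparisons (_CMP_LE_OQ, _CMP_LT_OQ, _CMP_EQ_OQ) are ordered and quiet, so they are false when either operand is NaN, while pcmp_lt_or_nan uses _CMP_NGE_UQ and is true for NaN inputs; the masks then combine with pand/pandnot/pselect to build branch-free code in the generic math functions. A scalar, per-32-bit-lane sketch of how such a mask drives a blend, for illustration only:

    #include <cstdint>
    #include <cstring>
    // Scalar model of pselect(mask, x, y) on one float lane; mask comes from a pcmp_* call.
    float blend_lane(uint32_t mask, float x, float y) {
      uint32_t xb, yb;
      std::memcpy(&xb, &x, 4);
      std::memcpy(&yb, &y, 4);
      uint32_t r = (mask & xb) | (~mask & yb);  // all-ones mask picks x, all-zeros picks y
      float out;
      std::memcpy(&out, &r, 4);
      return out;
    }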
@@ -208,17 +281,101 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ return _mm256_cmpeq_epi32(a,a);
+#else
+ const __m256 b = _mm256_castsi256_ps(a);
+ return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ const __m256i b = _mm256_castps_si256(a);
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+ return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqq has lower latency than the more general vcmppd
+ const __m256i b = _mm256_castpd_si256(a);
+ return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+ return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_and_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_or_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_xor_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_andnot_si256(b,a);
+#else
+ return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
+{ return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
+{ return _mm256_blendv_pd(b,a,mask); }
+
+template<int N> EIGEN_STRONG_INLINE Packet8i pshiftright(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_srli_epi32(a, N);
+#else
+ __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+ __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i pshiftleft(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_slli_epi32(a, N);
+#else
+ __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+ __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
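pshiftleft/pshiftright are now templates taking the shift amount as a compile-time parameter N (the old runtime-int versions in AVX/MathFunctions.h are removed above); with AVX2 they map to single vpslld/vpsrld instructions, and on plain AVX the 256-bit integer shift is emulated by splitting into two 128-bit halves. The generic math code uses them for IEEE-754 exponent manipulation. A hedged usage sketch, where n is assumed to hold small per-lane integers (illustration only):

    // Sketch: build 2^n for eight floats by placing biased exponents into bits 23..30.
    Packet8i biased    = padd(n, pset1<Packet8i>(127));               // single-precision exponent bias
    Packet8f two_pow_n = _mm256_castsi256_ps(pshiftleft<23>(biased)); // zero mantissa => exact powers of two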
@@ -363,6 +520,28 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
return _mm256_and_pd(a,mask);
}
+template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+ // Build e=2^n by constructing the exponents in a 128-bit vector and
+ // shifting them to where they belong in double-precision values.
+ Packet4i cst_1023 = pset1<Packet4i>(1023);
+ __m128i emm0 = _mm256_cvtpd_epi32(exponent);
+ emm0 = _mm_add_epi32(emm0, cst_1023);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i lo = _mm_slli_epi64(emm0, 52);
+ __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+ __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+ e = _mm256_insertf128_si256(e, hi, 1);
+ return pmul(a,_mm256_castsi256_pd(e));
+}
+
// preduxp should be ok
// FIXME: why is this ok? why isn't the simply implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
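pldexp<Packet4d> above scales by 2^exponent by synthesising the scale factors directly: the four exponents are converted to 32-bit integers, biased by 1023, and shifted into bits 52..62 of each 64-bit lane (the IEEE-754 double exponent field), after which a single multiply applies them. A scalar sketch of one lane, for illustration only (no overflow or denormal handling, matching the packet path):

    #include <cstdint>
    #include <cstring>
    // Scalar model of one lane of pldexp(a, e): a * 2^e via exponent-field construction.
    double ldexp_lane(double a, int e) {
      uint64_t bits = (uint64_t)(e + 1023) << 52;  // biased exponent, zero mantissa => 2^e
      double two_pow_e;
      std::memcpy(&two_pow_e, &bits, 8);
      return a * two_pow_e;
    }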
@@ -459,6 +638,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+// return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+ return _mm256_movemask_ps(x)!=0;
+}
template<int Offset>
struct palign_impl<Offset,Packet8f>
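predux_any above reduces a mask packet to one boolean by gathering the sign bit of each of the eight lanes with _mm256_movemask_ps and testing the byte against zero; the commented-out predux_all would instead require all eight bits set (0xFF). A scalar sketch of the two reductions, for illustration only:

    #include <cstdint>
    bool any_lane(uint8_t movemask_bits)  { return movemask_bits != 0; }    // predux_any
    bool all_lanes(uint8_t movemask_bits) { return movemask_bits == 0xFF; } // predux_all (not needed yet)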
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 83bfdc604..7d2e1e67f 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -37,13 +37,21 @@ struct type_casting_traits<int, float> {
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
- return _mm256_cvtps_epi32(a);
+ return _mm256_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
return _mm256_cvtepi32_ps(a);
}
+template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+ return _mm256_castps_si256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+ return _mm256_castsi256_ps(a);
+}
+
} // end namespace internal
} // end namespace Eigen
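Two changes in AVX TypeCasting.h: pcast<Packet8f,Packet8i> now uses _mm256_cvttps_epi32, which truncates toward zero like a C++ float-to-int conversion, instead of _mm256_cvtps_epi32, which rounds under the current rounding mode (round-to-nearest-even by default); and preinterpret is added for bit-preserving reinterpretation between Packet8f and Packet8i (a register cast, no value conversion). A scalar sketch of the distinction, for illustration only:

    #include <cstdint>
    #include <cstring>
    int32_t cast_value(float x) { return (int32_t)x; }  // value conversion, truncating: 2.7f -> 2
    int32_t reinterpret_bits(float x) {                 // bit reinterpretation: 2.7f -> 0x402CCCCD
      int32_t r;
      std::memcpy(&r, &x, 4);
      return r;
    }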
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
new file mode 100644
index 000000000..9a89dd01f
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -0,0 +1,488 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX512_H
+#define EIGEN_COMPLEX_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet8cf
+{
+ EIGEN_STRONG_INLINE Packet8cf() {}
+ EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
+ __m512 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+ typedef Packet8cf type;
+ typedef Packet4cf half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 8,
+ HasHalfPacket = 1,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasReduxp = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet8cf> {
+ typedef std::complex<float> type;
+ enum {
+ size = 8,
+ alignment=unpacket_traits<Packet16f>::alignment,
+ vectorizable=true
+ };
+ typedef Packet4cf half;
+};
+
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
+{
+ return Packet8cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
+{
+ const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
+ 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
+ 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+ return Packet8cf(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+ __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+ return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
+}
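pmul<Packet8cf> above uses the interleaved-complex multiply trick: _mm512_moveldup_ps duplicates the real parts of a, _mm512_movehdup_ps its imaginary parts, the permute swaps the real/imaginary parts of b, and _mm512_fmaddsub_ps subtracts in the even (real) lanes and adds in the odd (imaginary) lanes, giving (ar*br - ai*bi, ar*bi + ai*br) per complex element. A scalar sketch of one element, for illustration only:

    #include <complex>
    // Scalar model of one complex lane of pmul<Packet8cf>.
    std::complex<float> cmul_lane(std::complex<float> a, std::complex<float> b) {
      float tmp_re = a.imag() * b.imag();        // movehdup(a) * swapped(b), even lane
      float tmp_im = a.imag() * b.real();        // movehdup(a) * swapped(b), odd lane
      float re = a.real() * b.real() - tmp_re;   // fmaddsub: subtract in even lanes
      float im = a.real() * b.imag() + tmp_im;   // fmaddsub: add in odd lanes
      return std::complex<float>(re, im);
    }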
+
+template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+ __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+ return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
+{
+ return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
+{
+ return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+}
+template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
+{
+ return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
+{
+ return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
+{
+ pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a)
+{
+ return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+ return Packet8cf(_mm512_castsi512_ps(
+ _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
+ _mm512_castps_si512(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
+{
+ return predux(padd(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
+{
+ return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+ __m256 lane0 = extract256<0>(a.v);
+ __m256 lane1 = extract256<1>(a.v);
+ __m256 res = _mm256_add_ps(lane0, lane1);
+ return Packet4cf(res);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8cf>
+{
+ static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet16f>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, true,false>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, true,true>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+
+template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+ Packet8cf num = pmul(a, pconj(b));
+ __m512 tmp = _mm512_mul_ps(b.v, b.v);
+ __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1);
+ __m512 denom = _mm512_add_ps(tmp, tmp2);
+ return Packet8cf(_mm512_div_ps(num.v, denom));
+}
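pdiv<Packet8cf> above takes the usual a/b = a*conj(b) / |b|^2 route: the numerator is the conjugate product, and the denominator is b.v*b.v added to a copy of itself with real/imaginary lanes swapped (shuffle 0xB1), so both lanes of each complex element hold re^2 + im^2 before the final division. A scalar sketch of one element, for illustration only:

    #include <complex>
    // Scalar model of one complex lane of pdiv<Packet8cf>.
    std::complex<float> cdiv_lane(std::complex<float> a, std::complex<float> b) {
      std::complex<float> num = a * std::conj(b);           // pmul(a, pconj(b))
      float denom = b.real()*b.real() + b.imag()*b.imag();  // tmp + swapped(tmp)
      return std::complex<float>(num.real() / denom, num.imag() / denom);
    }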
+
+template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
+{
+ return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet4cd
+{
+ EIGEN_STRONG_INLINE Packet4cd() {}
+ EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
+ __m512d v;
+};
+
+template<> struct packet_traits<std::complex<double> > : default_packet_traits
+{
+ typedef Packet4cd type;
+ typedef Packet2cd half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 4,
+ HasHalfPacket = 1,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasReduxp = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet4cd> {
+ typedef std::complex<double> type;
+ enum {
+ size = 4,
+ alignment = unpacket_traits<Packet8d>::alignment,
+ vectorizable=true
+ };
+ typedef Packet2cd half;
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
+{
+ const __m512d mask = _mm512_castsi512_pd(
+ _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
+ 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+ return Packet4cd(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+ __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
+ __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
+ __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
+ __m512d odd = _mm512_mul_pd(tmp2, tmp3);
+ return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+ __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+ return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
+{
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
+ return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
+ #else
+ return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+ #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+ return Packet4cd(_mm512_insertf64x4(
+ _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
+{
+ return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
+{
+ __m512i fromi = _mm512_castpd_si512(from.v);
+ double* tod = (double*)(void*)to;
+ _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+ _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+ _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+ _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
+{
+ __m128d low = extract128<0>(a.v);
+ EIGEN_ALIGN16 double res[2];
+ _mm_store_pd(res, low);
+ return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+ return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
+{
+ return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+ Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
+{
+ return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+ Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cd>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet8d>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+
+template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+ Packet4cd num = pmul(a, pconj(b));
+ __m512d tmp = _mm512_mul_pd(b.v, b.v);
+ __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp);
+ return Packet4cd(_mm512_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
+{
+ return Packet4cd(_mm512_permute_pd(x.v,0x55));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,4>& kernel) {
+ PacketBlock<Packet8d,4> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,8>& kernel) {
+ PacketBlock<Packet8d,8> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+ pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+ pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+ pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+ kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+ kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+ kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+ kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cd,4>& kernel) {
+ __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
+ __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
+ __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
+ __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
+
+ kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
+ kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
+ kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
+ kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex<float> b)
+{
+ Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0));
+ tmp = pinsertfirst(tmp, b);
+ return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) );
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex<double> b)
+{
+ return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 0) ));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex<float> b)
+{
+ Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) );
+ tmp = pinsertlast(tmp, b);
+ return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) );
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex<double> b)
+{
+ return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 3) ));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 93c5ec43f..c2158c538 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y);
x = padd(x, y2);
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+ __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+ // Filter out invalid inputs, i.e.:
+ //  - a negative arg will be NAN,
+ //  - 0 will be -INF,
+ //  - +INF will be +INF.
return _mm512_mask_blend_ps(iszero_mask,
- _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
- p16f_minus_inf);
+ _mm512_mask_blend_ps(invalid_mask,
+ _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+ p16f_nan),
+ p16f_minus_inf);
}
#endif
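
Per lane, the nested _mm512_mask_blend_ps calls above implement the scalar special-case table sketched below (a minimal illustration assuming IEEE semantics; invalid_mask is computed earlier in plog and flags negative or NaN inputs, and log_special_cases is an ad hoc name):

#include <cmath>
#include <limits>

// Scalar equivalent of the special-case blending in plog<Packet16f>.
static float log_special_cases(float x, float poly_result)
{
  if (x == 0.0f)                  return -std::numeric_limits<float>::infinity(); // iszero_mask
  if (x < 0.0f || std::isnan(x))  return  std::numeric_limits<float>::quiet_NaN(); // invalid_mask
  if (std::isinf(x))              return  x;                                       // pos_inf_mask
  return poly_result;                                                              // regular path
}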
@@ -373,6 +380,19 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
#endif
#endif
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psin<Packet16f>(const Packet16f& _x) {
+ return psin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pcos<Packet16f>(const Packet16f& _x) {
+ return pcos_float(_x);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 86cefba92..60b723b08 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -19,10 +19,10 @@ namespace internal {
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
@@ -55,7 +55,9 @@ template<> struct packet_traits<float> : default_packet_traits
size = 16,
HasHalfPacket = 1,
HasBlend = 0,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
#endif
@@ -75,7 +77,7 @@ template<> struct packet_traits<double> : default_packet_traits
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
@@ -99,19 +101,20 @@ template <>
struct unpacket_traits<Packet16f> {
typedef float type;
typedef Packet8f half;
- enum { size = 16, alignment=Aligned64 };
+ typedef Packet16i integer_packet;
+ enum { size = 16, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet8d> {
typedef double type;
typedef Packet4d half;
- enum { size = 8, alignment=Aligned64 };
+ enum { size = 8, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet16i> {
typedef int type;
typedef Packet8i half;
- enum { size = 16, alignment=Aligned64 };
+ enum { size = 16, alignment=Aligned64, vectorizable=false };
};
template <>
@@ -128,12 +131,17 @@ EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
}
template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+ return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
return _mm512_broadcastss_ps(_mm_load_ps1(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
- return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+ return _mm512_set1_pd(*from);
}
template <>
@@ -159,6 +167,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_add_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_add_epi32(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@@ -170,6 +183,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_sub_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_sub_epi32(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -203,6 +221,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_mul_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_mullo_epi32(a, b); // element-wise 32-bit product (_mm512_mul_epi32 only multiplies the even lanes)
+}
template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@@ -215,7 +238,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
return _mm512_div_pd(a, b);
}
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
const Packet16f& c) {
@@ -254,30 +277,92 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
return _mm512_max_pd(b, a);
}
-template <>
-EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_and_ps(a, b);
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
+// AVX512F does not define _mm512_extractf32x8_ps to extract a __m256 from a __m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
+// AVX512F does not define _mm512_extractf64x2_pd to extract a __m128d from a __m512d
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+ return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+ return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+ _mm256_castps_si256(b),1));
+}
+#endif
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
- return res;
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+ __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
+ return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+ __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_pd(
+ _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+ return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+ return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+ return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_and_si512(a,b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
+ const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ return _mm512_and_ps(a, b);
+#else
+ return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
@@ -298,30 +383,18 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
return res;
#endif
}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_or_si512(a, b);
+}
+
template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
@@ -331,109 +404,59 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
+
template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_andnot_ps(a, b);
+ return _mm512_andnot_ps(b, a);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_andnot_pd(a, b);
+ return _mm512_andnot_pd(b, a);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
+template<int N> EIGEN_STRONG_INLINE Packet16i pshiftleft(Packet16i a) {
+ return _mm512_slli_epi32(a, N);
+}
+
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -475,6 +498,7 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
+// FIXME: this does not look optimal, better load a Packet4d and shuffle...
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2,
// a3, a3}
template <>
@@ -502,21 +526,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
- Packet16f tmp = _mm512_undefined_ps();
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
- return tmp;
+ Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+ const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ return _mm512_permutexvar_ps(scatter_mask, tmp);
}
+
// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
- __m128d tmp0 = _mm_load_pd1(from);
- __m256d lane0 = _mm256_broadcastsd_pd(tmp0);
- __m128d tmp1 = _mm_load_pd1(from + 1);
- __m256d lane1 = _mm256_broadcastsd_pd(tmp1);
+ __m256d lane0 = _mm256_set1_pd(*from);
+ __m256d lane1 = _mm256_set1_pd(*(from+1));
__m512d tmp = _mm512_undefined_pd();
tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1);
@@ -981,6 +1001,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+ Packet16i xi = _mm512_castps_si512(x);
+ __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+ return !_mm512_kortestz(tmp,tmp);
+}
+
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
@@ -1322,6 +1349,22 @@ template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
}
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+ return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+ return _mm512_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i,Packet16f>(const Packet16f& a) {
+ return _mm512_castps_si512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f,Packet16i>(const Packet16i& a) {
+ return _mm512_castsi512_ps(a);
+}
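+
The two kinds of conversion added here differ in the usual way: pcast is a value conversion while preinterpret only relabels the bit pattern. A hedged scalar analogue (names are ad hoc, illustration only):

#include <cstdint>
#include <cstring>

static int32_t cast_like_pcast(float f) { return static_cast<int32_t>(f); } // value conversion, truncates toward zero

static int32_t reinterpret_like_preinterpret(float f)
{
  int32_t i;
  std::memcpy(&i, &f, sizeof(i)); // same bits, different type; no numeric conversion
  return i;
}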
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 3e665730c..440d058d8 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
pstore<std::complex<float> >((std::complex<float> *) af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<float> res[2];
pstore((float *)&res, a.v);
return res[0];
@@ -286,7 +286,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet1cd>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
pstore<std::complex<double> >(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<double> res[2];
pstore<std::complex<double> >(res, a);
return res[0];
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index c5e4bede7..81097e668 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -9,191 +9,37 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
-static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-/* the smallest non denormalized float number */
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
-
-/* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
-*/
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
-static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-#ifdef __VSX__
-static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
-static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-#ifdef __POWER8_VECTOR__
-static Packet2l p2l_1023 = { 1023, 1023 };
-static Packet2ul p2ul_52 = { 52, 52 };
-#endif
-
-#endif
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
-
- Packet4i emm0;
-
- /* isvalid_mask is 0 if x < 0 or x is NaN. */
- Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
- Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
- reinterpret_cast<Packet4ui>(p4i_23));
-
- /* keep only the fractional part */
- x = pand(x, p4f_inv_mant_mask);
- x = por(x, p4f_half);
-
- emm0 = psub(emm0, p4i_0x7f);
- Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- x = vec_sel(x, p4f_minus_inf, iszero_mask);
- x = vec_sel(p4f_minus_nan, x, isvalid_mask);
- return x;
+ return plog_float(_x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- // express exp(x) as exp(g + n*log(2))
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
- fx = pfloor(fx);
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
+ return pexp_float(_x);
+}
- // build 2^n
- emm0 = vec_cts(fx, 0);
- emm0 = vec_add(emm0, p4i_0x7f);
- emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+ return psin_float(_x);
+}
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
- isnumber_mask);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+ return pcos_float(_x);
}
#ifndef EIGEN_COMP_CLANG
@@ -225,93 +71,10 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
return vec_sqrt(x);
}
-// VSX support varies between different compilers and even different
-// versions of the same compiler. For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
-// a slow version that works with older compilers.
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
-// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
- (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
- return vec_cts(x, 0); // TODO: check clang version.
-#else
- double tmp[2];
- memcpy(tmp, &x, sizeof(tmp));
- Packet2l l = { static_cast<long long>(tmp[0]),
- static_cast<long long>(tmp[1]) };
- return l;
-#endif
-}
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
- Packet2d x = _x;
-
- Packet2d tmp, fx;
- Packet2l emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
-
- fx = pfloor(fx);
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = ConvertToPacket2l(fx);
-
-#ifdef __POWER8_VECTOR__
- emm0 = vec_add(emm0, p2l_1023);
- emm0 = vec_sl(emm0, p2ul_52);
-#else
- // Code is a bit complex for POWER7. There is actually a
- // vec_xxsldi intrinsic but it is not supported by some gcc versions.
- // So we shift (52-32) bits and do a word swap with zeros.
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
- _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32
-
- Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
- emm04i = vec_add(emm04i, p4i_1023);
- emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
- static const Packet16uc perm = {
- 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
- 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef _BIG_ENDIAN
- emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
- emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
-#endif
-
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
- isnumber_mask);
+ return pexp_double(_x);
}
#endif
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7f4e90f75..9535724eb 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -146,9 +146,9 @@ template<> struct packet_traits<float> : default_packet_traits
HasMin = 1,
HasMax = 1,
HasAbs = 1,
- HasSin = 0,
- HasCos = 0,
- HasLog = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasLog = 1,
HasExp = 1,
#ifdef __VSX__
HasSqrt = 1,
@@ -187,8 +187,19 @@ template<> struct packet_traits<int> : default_packet_traits
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f>
+{
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+ typedef int type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=false};
+};
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
{
@@ -285,6 +296,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
Packet4i v = {from, from, from, from};
return v;
}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+ return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
+
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
@@ -308,7 +324,7 @@ pbroadcast4<Packet4i>(const int *a,
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
@@ -317,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
@@ -326,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
pstore<float>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -335,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
pstore<int>((int *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
@@ -414,6 +430,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
+ return vec_nor(c,c);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
@@ -426,6 +451,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+ return vec_sel(b, a, mask);
+}
+
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
@@ -536,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f&
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
@@ -550,6 +579,19 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a)
+{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a)
+{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f b, sum;
@@ -678,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
return pfirst(res);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return vec_any_ne(x, pzero(x));
+}
+
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
@@ -771,6 +818,43 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
}
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vec_cts(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return reinterpret_cast<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return reinterpret_cast<Packet4f>(a);
+}
+
+
+
//---------- double ----------
#ifdef __VSX__
typedef __vector double Packet2d;
@@ -837,7 +921,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{
@@ -901,14 +985,14 @@ pbroadcast4<Packet2d>(const double *a,
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
pstore<double>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -980,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
@@ -988,6 +1072,59 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+// VSX support varies between different compilers and even different
+// versions of the same compiler. For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
+// a slow version that works with older compilers.
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+ (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+ return vec_cts(x, 0); // TODO: check clang version.
+#else
+ double tmp[2];
+ memcpy(tmp, &x, sizeof(tmp));
+ Packet2l l = { static_cast<long long>(tmp[0]),
+ static_cast<long long>(tmp[1]) };
+ return l;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+
+ // build 2^n
+ Packet2l emm0 = ConvertToPacket2l(exponent);
+
+#ifdef __POWER8_VECTOR__
+ const Packet2l p2l_1023 = { 1023, 1023 };
+ const Packet2ul p2ul_52 = { 52, 52 };
+ emm0 = vec_add(emm0, p2l_1023);
+ emm0 = vec_sl(emm0, p2ul_52);
+#else
+ // Code is a bit complex for POWER7. There is actually a
+ // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+ // So we shift (52-32) bits and do a word swap with zeros.
+ const Packet4i p4i_1023 = pset1<Packet4i>(1023);
+ const Packet4i p4i_20 = pset1<Packet4i>(20); // 52 - 32
+
+ Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
+ emm04i = vec_add(emm04i, p4i_1023);
+ emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+ static const Packet16uc perm = {
+ 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef _BIG_ENDIAN
+ emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+ emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+#endif
+
+#endif
+
+ return pmul(a, reinterpret_cast<Packet2d>(emm0));
+}
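+
The POWER8 branch above builds 2^n by writing n+1023 into the exponent field of an IEEE-754 double; a scalar sketch of that construction (ad hoc helper, valid for exponents yielding normal numbers):

#include <cstdint>
#include <cstring>

static double two_to_the(int n) // assumes -1022 <= n <= 1023
{
  uint64_t bits = uint64_t(n + 1023) << 52; // biased exponent in bits 52..62, zero mantissa
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}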
+
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
Packet2d b, sum;
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
new file mode 100644
index 000000000..452b4c806
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -0,0 +1,471 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file initially come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+namespace Eigen {
+namespace internal {
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pfrexp_float(const Packet& a, Packet& exponent) {
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+ const Packet cst_126f = pset1<Packet>(126.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ const Packet cst_inv_mant_mask = pset1frombits<Packet>(~0x7f800000u);
+ exponent = psub(pcast<PacketI,Packet>(pshiftright<23>(preinterpret<PacketI>(a))), cst_126f);
+ return por(pand(a, cst_inv_mant_mask), cst_half);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_float(Packet a, Packet exponent)
+{
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+ const Packet cst_127 = pset1<Packet>(127.f);
+ // return a * 2^exponent
+ PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
+ return pmul(a, preinterpret<Packet>(pshiftleft<23>(ei)));
+}
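+
Both helpers above work directly on the IEEE-754 fields of a binary32: pfrexp_float reads the biased exponent and forces the significand into [0.5,1), and pldexp_float builds 2^exponent in place. A scalar sketch of the same bit manipulation (assumes normalized inputs and exponents in the normal range; helper names are ad hoc):

#include <cstdint>
#include <cstring>

static float frexp_like(float x, float& exponent)
{
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  exponent = float(int((bits >> 23) & 0xff)) - 126.f; // unbiased so that the returned significand is in [0.5,1)
  bits = (bits & ~0x7f800000u) | 0x3f000000u;         // keep sign and mantissa, force the exponent of 0.5
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

static float ldexp_like(float m, float e) // m * 2^e, mirroring pldexp_float
{
  uint32_t bits = uint32_t(int(e) + 127) << 23;       // 2^e built directly in the exponent field
  float two_to_e;
  std::memcpy(&two_to_e, &bits, sizeof(two_to_e));
  return m * two_to_e;
}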
+
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+// polynomial interpolants -> ... -> profit!
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x)
+{
+ Packet x = _x;
+
+ const Packet cst_1 = pset1<Packet>(1.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ // The smallest non denormalized float number.
+ const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
+ const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
+ const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
+
+ // Polynomial coefficients.
+ const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
+ const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
+ const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
+ const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
+ const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
+ const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
+ const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
+ const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
+ const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
+ const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
+ const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
+ const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
+
+ // Truncate input values to the minimum positive normal.
+ x = pmax(x, cst_min_norm_pos);
+
+ Packet e;
+ // extract the significand in the range [0.5,1) and the exponent
+ x = pfrexp(x,e);
+
+ // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+ // and shift by -1. The values are then centered around 0, which improves
+ // the stability of the polynomial evaluation.
+ // if( x < SQRTHF ) {
+ // e -= 1;
+ // x = x + x - 1.0;
+ // } else { x = x - 1.0; }
+ Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+ Packet tmp = pand(x, mask);
+ x = psub(x, cst_1);
+ e = psub(e, pand(cst_1, mask));
+ x = padd(x, tmp);
+
+ Packet x2 = pmul(x, x);
+ Packet x3 = pmul(x2, x);
+
+ // Evaluate the polynomial approximant of degree 8 in three parts, probably
+ // to improve instruction-level parallelism.
+ Packet y, y1, y2;
+ y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+ y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+ y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
+ y = pmadd(y, x, cst_cephes_log_p2);
+ y1 = pmadd(y1, x, cst_cephes_log_p5);
+ y2 = pmadd(y2, x, cst_cephes_log_p8);
+ y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y2);
+ y = pmul(y, x3);
+
+ // Add the logarithm of the exponent back to the result of the interpolation.
+ y1 = pmul(e, cst_cephes_log_q1);
+ tmp = pmul(x2, cst_half);
+ y = padd(y, y1);
+ x = psub(x, tmp);
+ y2 = pmul(e, cst_cephes_log_q2);
+ x = padd(x, y);
+ x = padd(x, y2);
+
+ Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+ Packet iszero_mask = pcmp_eq(_x,pzero(_x));
+ Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+ // Filter out invalid inputs, i.e.:
+ //  - a negative arg will be NAN,
+ //  - 0 will be -INF,
+ //  - +INF will be +INF.
+ return pselect(iszero_mask, cst_minus_inf,
+ por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x)
+{
+ const Packet cst_1 = pset1<Packet>(1.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
+ const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
+
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
+ const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
+ const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
+ const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
+
+ // Clamp x.
+ Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
+
+ // Express exp(x) as exp(m*ln(2) + r), start by extracting
+ // m = floor(x/ln(2) + 0.5).
+ Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+
+ // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+ // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+ // truncation errors.
+ Packet r;
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
+ r = pmadd(m, cst_nln2, x);
+#else
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
+ r = psub(x, pmul(m, cst_cephes_exp_C1));
+ r = psub(r, pmul(m, cst_cephes_exp_C2));
+#endif
+
+ Packet r2 = pmul(r, r);
+
+ // TODO(gonnet): Split into odd/even polynomials and try to exploit
+ // instruction-level parallelism.
+ Packet y = cst_cephes_exp_p0;
+ y = pmadd(y, r, cst_cephes_exp_p1);
+ y = pmadd(y, r, cst_cephes_exp_p2);
+ y = pmadd(y, r, cst_cephes_exp_p3);
+ y = pmadd(y, r, cst_cephes_exp_p4);
+ y = pmadd(y, r, cst_cephes_exp_p5);
+ y = pmadd(y, r2, r);
+ y = padd(y, cst_1);
+
+ // Return 2^m * exp(r).
+ return pmax(pldexp(y,m), _x);
+}
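+
A scalar restatement of the range reduction described above (illustration only; exp_reduced is an ad hoc name and uses std::exp where pexp_float uses the degree-6 polynomial):

#include <cmath>

static float exp_reduced(float x) // assumes x already clamped, as pexp_float does
{
  const float ln2 = 0.6931471805599453f;
  float m = std::floor(x / ln2 + 0.5f);   // m = floor(x*log2(e) + 1/2)
  float r = x - m * ln2;                  // remainder, |r| <= ln2/2 up to rounding
  return std::ldexp(std::exp(r), int(m)); // exp(x) = 2^m * exp(r)
}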
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x)
+{
+ Packet x = _x;
+
+ const Packet cst_1 = pset1<Packet>(1.0);
+ const Packet cst_2 = pset1<Packet>(2.0);
+ const Packet cst_half = pset1<Packet>(0.5);
+
+ const Packet cst_exp_hi = pset1<Packet>(709.437);
+ const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
+
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+ const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+ const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+ const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+ const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+ Packet tmp, fx;
+
+ // clamp x
+ x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
+ // Express exp(x) as exp(g + n*log(2)).
+ fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+ // Get the integer modulus of log(2), i.e. the "n" described above.
+ fx = pfloor(fx);
+
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+ // digits right.
+ tmp = pmul(fx, cst_cephes_exp_C1);
+ Packet z = pmul(fx, cst_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ Packet x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial of the rational interpolant.
+ Packet px = cst_cephes_exp_p0;
+ px = pmadd(px, x2, cst_cephes_exp_p1);
+ px = pmadd(px, x2, cst_cephes_exp_p2);
+ px = pmul(px, x);
+
+ // Evaluate the denominator polynomial of the rational interpolant.
+ Packet qx = cst_cephes_exp_q0;
+ qx = pmadd(qx, x2, cst_cephes_exp_q1);
+ qx = pmadd(qx, x2, cst_cephes_exp_q2);
+ qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+ // I don't really get this bit, copied from the SSE2 routines, so...
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
+ // rational interpolant?
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(cst_2, x, cst_1);
+
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+ // non-finite values in the input.
+ return pmax(pldexp(x,fx), _x);
+}
+
+// The following code is inspired by the following stack-overflow answer:
+// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+// - Bypassing calls to frexp.
+// - Aligned loads of the required 96 bits of 2/pi. This is accomplished by
+//   (1) balancing the mantissa and exponent so that the required bits of 2/pi
+//   are aligned on 8 bits, and (2) replicating the storage of the bits of 2/pi.
+// - Avoiding a branch in the rounding and extraction of the remaining fractional part.
+// Overall, I measured a speedup higher than 2x on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+ using Eigen::numext::int32_t;
+ using Eigen::numext::uint32_t;
+ using Eigen::numext::int64_t;
+ using Eigen::numext::uint64_t;
+
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
+ const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62 fixed-point format
+
+ // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced in chunks of 8 to enable aligned reads.
+ static const uint32_t two_over_pi [] =
+ {
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+ 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+ 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+ 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+ 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+ 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+ 0x10e41000, 0xe4100000
+ };
+
+ uint32_t xi = numext::as_uint(xf);
+ // Below, -118 = -126 + 8.
+ // -126 is to get the exponent,
+ // +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x has only 24 meaningful bits.
+ uint32_t e = (xi >> 23) - 118;
+ // Extract the mantissa and shift it to align it wrt the exponent
+ xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+ uint32_t i = e >> 3;
+ uint32_t twoopi_1 = two_over_pi[i-1];
+ uint32_t twoopi_2 = two_over_pi[i+3];
+ uint32_t twoopi_3 = two_over_pi[i+7];
+
+ // Compute x * 2/pi in 2.62-bit fixed-point format.
+ uint64_t p;
+ p = uint64_t(xi) * twoopi_3;
+ p = uint64_t(xi) * twoopi_2 + (p >> 32);
+ p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+ // Round to nearest: add 0.5 and extract integral part.
+ uint64_t q = (p + zero_dot_five) >> 62;
+ *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy. Since we
+  // have p = x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be carried out with sufficient accuracy using double precision.
+ p -= q<<62;
+ return float(double(int64_t(p)) * pio2_62);
+}
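
To show how the (remainder, quadrant) pair returned above is consumed, here is a self-contained scalar sketch that performs the same reduction in plain double precision (accurate enough for moderate inputs, unlike the fixed-point path needed for huge ones) and reconstructs sin(x) from it. The helper name is mine; it is not part of the patch.

#include <cmath>
#include <cstdio>

float sin_via_quadrant(float x) {
  const double pio2 = 1.57079632679489661923;   // pi/2
  long q = std::lround(double(x) / pio2);       // quadrant: nearest integer to x*2/pi
  double r = double(x) - double(q) * pio2;      // remainder in [-pi/4, pi/4]
  switch (q & 3) {                              // quadrant selects polynomial and sign,
    case 0:  return float( std::sin(r));        // exactly as psincos_float does below
    case 1:  return float( std::cos(r));
    case 2:  return float(-std::sin(r));
    default: return float(-std::cos(r));
  }
}

int main() { std::printf("%g vs %g\n", sin_via_quadrant(100.0f), std::sin(100.0)); }
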
+
+template<bool ComputeSine,typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+#if EIGEN_GNUC_AT_LEAST(4,4)
+__attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+Packet psincos_float(const Packet& _x)
+{
+// Workaround -ffast-math aggressive optimizations
+// See bug 1674
+#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
+#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X));
+#else
+#define EIGEN_SINCOS_DONT_OPT(X)
+#endif
+
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+ const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
+ const PacketI csti_1 = pset1<PacketI>(1);
+ const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
+
+ Packet x = pabs(_x);
+
+  // Scale x by 2/Pi to find x's quadrant.
+ Packet y = pmul(x, cst_2oPI);
+
+ // Rounding trick:
+ Packet y_round = padd(y, cst_rounding_magic);
+ EIGEN_SINCOS_DONT_OPT(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 bits represent the integer (if abs(x)<2^24)
+  y  = psub(y_round, cst_rounding_magic); // nearest integer to x*2/pi
+
+  // Reduce x by y quadrants (i.e. subtract y*Pi/2) to get: -Pi/4 <= x <= +Pi/4
+ // using "Extended precision modular arithmetic"
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+ // This version requires true FMA for high accuracy
+  // It provides a max error of 1 ULP up to the huge_th thresholds below (with absolute error < 5.9605e-08):
+ const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+ x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+ x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+ x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+ #else
+ // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+ // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+ // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintains 1 ULP up to 9.43 and 14.16 for sin and cos respectively,
+  // and 2 ULP up to the huge_th thresholds below:
+ const float huge_th = ComputeSine ? 25966.f : 18838.f;
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+ EIGEN_SINCOS_DONT_OPT(x)
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+ EIGEN_SINCOS_DONT_OPT(x)
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+ // For the record, the following set of coefficients maintain 2ULP up
+ // to a slightly larger range:
+ // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+ // but it slightly fails to maintain 1ULP for two values of sin below pi.
+ // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+ // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+ // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+ // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+ // For the record, with only 3 iterations it is possible to maintain
+ // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+ // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+ #endif
+
+ if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+ {
+ const int PacketSize = unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+ pstoreu(vals, pabs(_x));
+ pstoreu(x_cpy, x);
+ pstoreu(y_int2, y_int);
+ for(int k=0; k<PacketSize;++k)
+ {
+ float val = vals[k];
+ if(val>=huge_th && (numext::isfinite)(val))
+ x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+ }
+ x = ploadu<Packet>(x_cpy);
+ y_int = ploadu<PacketI>(y_int2);
+ }
+
+ // Compute the sign to apply to the polynomial.
+ // sin: sign = second_bit(y_int) xor signbit(_x)
+ // cos: sign = second_bit(y_int+1)
+ Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+ : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but the leftmost (sign) bit
+
+  // Get the polynomial selection mask from the lowest bit of y_int
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+ Packet x2 = pmul(x,x);
+
+ // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+ y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ // octave/matlab code to compute those coefficients:
+ // x = (0:0.0001:pi/4)';
+ // A = [x.^3 x.^5 x.^7];
+ // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
+ // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+ // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+ //
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+ y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+ y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+ y2 = pmul(y2, x2);
+ y2 = pmadd(y2, x, x);
+
+ // Select the correct result from the two polynomials.
+ y = ComputeSine ? pselect(poly_mask,y2,y1)
+ : pselect(poly_mask,y1,y2);
+
+ // Update the sign and filter huge inputs
+ return pxor(y, sign_bit);
+
+#undef EIGEN_SINCOS_DONT_OPT
+}
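
The rounding trick above relies on the constant 12582912 = 1.5*2^23: adding it to y in round-to-nearest mode leaves round(y) in the low mantissa bits, and subtracting it back recovers round(y) as a float. A minimal scalar sketch (mine, not part of the patch; it assumes the default rounding mode and no -ffast-math, which is exactly what EIGEN_SINCOS_DONT_OPT protects against):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const float magic = 12582912.0f;          // 1.5 * 2^23
  float y = 5.3f;
  volatile float y_round = y + magic;       // volatile plays the role of EIGEN_SINCOS_DONT_OPT
  float nearest = y_round - magic;          // 5.0f: the integer nearest to y
  float tmp = y_round;
  std::uint32_t bits;
  std::memcpy(&bits, &tmp, sizeof bits);    // reinterpret, like preinterpret<PacketI>
  std::printf("nearest=%g  round(y) mod 4 = %u\n", nearest, bits & 3u);
  return 0;
}
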
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x)
+{
+ return psincos_float<true>(x);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x)
+{
+ return psincos_float<false>(x);
+}
+
+} // end namespace internal
+} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index 097373c84..a5c3ada4c 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -21,7 +21,7 @@
* it does not correspond to the number of iterations or the number of instructions
*/
#ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
+#define EIGEN_UNROLLING_LIMIT 110
#endif
/** Defines the threshold between a "small" and a "large" matrix.
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ddf37b9c1..cd4615a45 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -53,6 +53,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasBetaInc = 1,
HasBlend = 0,
+ HasFloor = 1,
};
};
@@ -86,12 +87,13 @@ template<> struct packet_traits<double> : default_packet_traits
HasBetaInc = 1,
HasBlend = 0,
+ HasFloor = 1,
};
};
-template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);
@@ -100,6 +102,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
return make_double2(from, from);
}
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) |
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) ^
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+ const float& b) {
+ return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+ const double& b) {
+ return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+} // namespace
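
These helpers lean on the CUDA/HIP reinterpret intrinsics (__int_as_float and friends). For a host-side mental model, the same bit-level operation can be written portably with memcpy; a hedged sketch, not part of the patch:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret to integer bits, AND them, reinterpret back; memcpy keeps the
// code free of strict-aliasing problems.
inline float bitwise_and_host(float a, float b) {
  std::uint32_t ua, ub;
  std::memcpy(&ua, &a, sizeof a);
  std::memcpy(&ub, &b, sizeof b);
  std::uint32_t ur = ua & ub;
  float r;
  std::memcpy(&r, &ur, sizeof r);
  return r;
}

int main() {
  std::uint32_t abs_bits = 0x7fffffffu;     // mask that clears the sign bit
  float abs_mask;
  std::memcpy(&abs_mask, &abs_bits, sizeof abs_mask);
  std::printf("%g\n", bitwise_and_host(-3.5f, abs_mask));   // prints 3.5
}
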
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+ bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+ bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+ bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+ bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+ return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+ eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+ return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
@@ -297,6 +410,13 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(fabs(a.x), fabs(a.y));
}
+template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+ return make_double2(floor(a.x), floor(a.y));
+}
+
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
float tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 8787adcde..869fa7ec6 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -30,6 +30,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
size=2,
HasHalfPacket = 0,
HasAdd = 1,
+ HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasSqrt = 1,
@@ -41,7 +42,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
};
};
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
@@ -137,12 +138,22 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
- half2 result;
- unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
- *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
- return result;
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+ return __halves2half2(result1, result2);
}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) {
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
+ return pset1<half2>(true_half);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero<half2>(const half2& a) {
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
+ return pset1<half2>(false_half);
+}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
@@ -171,6 +182,68 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
#endif
}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq<half2>(const half2& a,
+ const half2& b) {
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+ half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+ return __halves2half2(eq1, eq2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+ return __halves2half2(result1, result2);
+}
+
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -500,6 +573,7 @@ struct packet_traits<half> : default_packet_traits {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
@@ -507,7 +581,6 @@ struct packet_traits<half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -517,7 +590,7 @@ struct packet_traits<half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; };
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
Packet16h result;
@@ -640,6 +713,36 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+ Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+ Packet16h r; r.x = Packet8i(ptrue(a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+ // in some cases Packet8i is a wrapper around __m256i, so we need to
+ // cast to Packet8i to call the correct overload.
+ Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
@@ -668,6 +771,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, con
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pdiv(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
@@ -952,6 +1062,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
@@ -959,7 +1070,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -969,7 +1079,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; };
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
Packet8h result;
@@ -1063,6 +1173,32 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to
+  // cast to Packet4i or directly call the intrinsics, as done below:
+ Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
@@ -1093,6 +1229,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pdiv(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
{
Packet8h result;
@@ -1279,9 +1422,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
@@ -1289,7 +1433,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -1299,7 +1442,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; };
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
Packet4h result;
@@ -1336,6 +1479,29 @@ template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const
return result;
}
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha - hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
__int64_t a64 = _mm_cvtm64_si64(a.x);
__int64_t b64 = _mm_cvtm64_si64(b.x);
@@ -1359,6 +1525,29 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const
return result;
}
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha / hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
Packet4h result;
result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index 9a45cf51e..fa64d3564 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -127,7 +127,7 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
- enum { size = 2, alignment = Aligned16 };
+ enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2cf half;
};
@@ -500,7 +500,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
- enum { size = 1, alignment = Aligned16 };
+ enum { size = 1, alignment = Aligned16, vectorizable=true };
typedef Packet1cd half;
};
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
index 98e23e36f..f5181b90e 100644
--- a/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -261,7 +261,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
- Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
y = __builtin_msa_ffint_s_w(y_int2);
// Compute the sign to apply to the polynomial.
@@ -305,7 +305,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
// Update the sign.
sign_mask = pxor(sign_mask, (Packet4i)y);
- y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
return y;
}
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index 094c874ee..a97156a84 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -117,14 +117,14 @@ struct packet_traits<int32_t> : default_packet_traits {
template <>
struct unpacket_traits<Packet4f> {
typedef float type;
- enum { size = 4, alignment = Aligned16 };
+ enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4f half;
};
template <>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
- enum { size = 4, alignment = Aligned16 };
+ enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4i half;
};
@@ -925,7 +925,7 @@ struct packet_traits<double> : default_packet_traits {
template <>
struct unpacket_traits<Packet2d> {
typedef double type;
- enum { size = 2, alignment = Aligned16 };
+ enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2d half;
};
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 306a309be..f6c5c211c 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -62,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -101,6 +101,18 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
return Packet2cf(vaddq_f32(v1, v2));
}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+ // Compare real and imaginary parts of a and b to get the mask vector:
+ // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+ Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask to get:
+ // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+ Packet4f eq_swapped = vrev64q_f32(eq);
+ // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+ return Packet2cf(pand<Packet4f>(eq, eq_swapped));
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
@@ -146,7 +158,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 x[2];
+ EIGEN_ALIGN16 std::complex<float> x[2];
vst1q_f32((float *)x, a.v);
return x[0];
}
@@ -328,7 +340,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -361,6 +373,18 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
return Packet1cd(vaddq_f64(v1, v2));
}
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+ // Compare real and imaginary parts of a and b to get the mask vector:
+ // [re(a)==re(b), im(a)==im(b)]
+ Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask to get:
+ // [im(a)==im(b), re(a)==re(b)]
+ Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+ // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+ return Packet1cd(pand<Packet2d>(eq, eq_swapped));
+}
+
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
@@ -401,7 +425,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res;
+ EIGEN_ALIGN16 std::complex<double> res;
pstore<std::complex<double> >(&res, a);
return res;
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index c48c61023..2e7d0e944 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -5,175 +5,37 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
#define EIGEN_MATH_FUNCTIONS_NEON_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
+Packet4f pexp<Packet4f>(const Packet4f& x)
{
- Packet4f x = _x;
- Packet4f tmp, fx;
-
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- x = vminq_f32(x, p4f_exp_hi);
- x = vmaxq_f32(x, p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
-
- /* perform a floorf */
- tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
-
- /* if greater, substract 1 */
- Packet4ui mask = vcgtq_f32(tmp, fx);
- mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
-
- fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-
- tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
- Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
- x = vsubq_f32(x, tmp);
- x = vsubq_f32(x, z);
-
- Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
- z = vmulq_f32(x, x);
- y = vaddq_f32(y, p4f_cephes_exp_p1);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p2);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p3);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p4);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p5);
-
- y = vmulq_f32(y, z);
- y = vaddq_f32(y, x);
- y = vaddq_f32(y, p4f_1);
-
- /* build 2^n */
- int32x4_t mm;
- mm = vcvtq_s32_f32(fx);
- mm = vaddq_s32(mm, p4i_0x7f);
- mm = vshlq_n_s32(mm, 23);
- Packet4f pow2n = vreinterpretq_f32_s32(mm);
-
- y = vmulq_f32(y, pow2n);
- return y;
+ return pexp_float(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
+Packet4f plog<Packet4f>(const Packet4f& x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
-
- /* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
- x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
- Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
-
- Packet4i ux = vreinterpretq_s32_f32(x);
-
- Packet4i emm0 = vshrq_n_s32(ux, 23);
-
- /* keep only the fractional part */
- ux = vandq_s32(ux, p4i_inv_mant_mask);
- ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
- x = vreinterpretq_f32_s32(ux);
-
- emm0 = vsubq_s32(emm0, p4i_0x7f);
- Packet4f e = vcvtq_f32_s32(emm0);
-
- e = vaddq_f32(e, p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
- Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
- x = vsubq_f32(x, p4f_1);
- e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
- x = vaddq_f32(x, tmp);
-
- Packet4f z = vmulq_f32(x,x);
-
- Packet4f y = p4f_cephes_log_p0;
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p1);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p2);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p3);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p4);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p5);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p6);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p7);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p8);
- y = vmulq_f32(y, x);
-
- y = vmulq_f32(y, z);
-
- tmp = vmulq_f32(e, p4f_cephes_log_q1);
- y = vaddq_f32(y, tmp);
-
+ return plog_float(x);
+}
- tmp = vmulq_f32(z, p4f_half);
- y = vsubq_f32(y, tmp);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& x)
+{
+ return psin_float(x);
+}
- tmp = vmulq_f32(e, p4f_cephes_log_q2);
- x = vaddq_f32(x, y);
- x = vaddq_f32(x, tmp);
- x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
- return x;
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& x)
+{
+ return pcos_float(x);
}
} // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 010739380..e8b351849 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -108,10 +108,11 @@ template<> struct packet_traits<float> : default_packet_traits
size = 4,
HasHalfPacket=0, // Packet2f intrinsics not implemented yet
- HasDiv = 1,
+ HasDiv = 1,
+ HasFloor = 1,
// FIXME check the Has*
- HasSin = 0,
- HasCos = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 0
@@ -139,12 +140,25 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f>
+{
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+ typedef int32_t type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
const float f[] = {0, 1, 2, 3};
@@ -249,6 +263,25 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+ /* perform a floorf */
+ Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
+
+  /* if greater, subtract 1 */
+ Packet4ui mask = vcgtq_f32(tmp, a);
+ mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
+ return vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+}
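
The same truncate-then-correct floor in scalar form, valid as long as |a| fits in an int32 (the int round-trip saturates beyond that); a sketch only, not the NEON code:

#include <cstdio>

float floor_sketch(float a) {
  float t = float(int(a));        // truncate toward zero (the vcvtq round-trip)
  return t > a ? t - 1.0f : t;    // if greater, subtract 1 (negative non-integers)
}

int main() { std::printf("%g %g\n", floor_sketch(2.7f), floor_sketch(-2.7f)); }  // prints 2 -3
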
+
// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
@@ -274,6 +307,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return vshrq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return vshlq_n_s32(a,N); }
+
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
@@ -339,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EI
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
@@ -364,6 +400,14 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, sum;
@@ -507,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
return vget_lane_s32(max, 0);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+ vget_high_u32(vreinterpretq_u32_f32(x)));
+ return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
+
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
@@ -606,7 +657,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
@@ -660,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, con
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 95d1fd0e4..20dbe1332 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -41,6 +41,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i
return vcvtq_f32_s32(a);
}
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return vreinterpretq_s32_f32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return vreinterpretq_f32_s32(a);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d075043ce..f39988eac 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -82,10 +82,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
@@ -280,7 +283,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -305,10 +308,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
// FIXME force unaligned load, this is a temporary fix
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
@@ -439,6 +444,18 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+ __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+ return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+ __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+ return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 4af2c6cae..0d491ab88 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -8,13 +8,15 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
+/* The sin and cos functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
@@ -22,424 +24,31 @@ namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- /* the smallest non denormalized float number */
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
-
- /* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
- Packet4i emm0;
-
- Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
- Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
- /* keep only the fractional part */
- x = _mm_and_ps(x, p4f_inv_mant_mask);
- x = _mm_or_ps(x, p4f_half);
-
- emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
- Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
- _mm_and_ps(iszero_mask, p4f_minus_inf));
+ return plog_float(_x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_ps(fx);
-#else
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, substract 1 */
- Packet4f mask = _mm_cmpgt_ps(tmp, fx);
- mask = _mm_and_ps(mask, p4f_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
-
- // build 2^n
- emm0 = _mm_cvttps_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_0x7f);
- emm0 = _mm_slli_epi32(emm0, 23);
- return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+ return pexp_float(_x);
}
+
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet2d pexp<Packet2d>(const Packet2d& x)
{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
- static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
- Packet2d tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_pd(fx);
-#else
- emm0 = _mm_cvttpd_epi32(fx);
- tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
- Packet2d mask = _mm_cmpgt_pd(tmp, fx);
- mask = _mm_and_pd(mask, p2d_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = _mm_cvttpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023_0);
- emm0 = _mm_slli_epi32(emm0, 20);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+ return pexp_double(x);
}
-/* evaluation of 4 sines at once, using SSE2 intrinsics.
-
- The code is the exact rewriting of the cephes sinf function.
- Precision is excellent as long as x < 8192 (I did not bother to
- take into account the special handling they have for greater values
- -- it does not return garbage for arguments over 8192, though, but
- the extra precision is missing).
-
- Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- surprising but correct result.
-*/
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psin<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
- Packet4i emm0, emm2;
- sign_bit = x;
- /* take the absolute value */
- x = pabs(x);
-
- /* take the modulo */
-
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* store the integer part of y in mm0 */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
- /* get the swap sign flag */
- emm0 = _mm_and_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask
- there is one polynom for 0 <= x <= Pi/4
- and another one for Pi/4<x<=Pi/2
-
- Both branches will be computed.
- */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = _mm_mul_ps(x,x);
-
- y = pmadd(y, z, p4f_coscof_p1);
- y = pmadd(y, z, p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = pmul(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmul(y2, x);
- y2 = padd(y2, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
+ return psin_float(_x);
}
-/* almost the same as psin */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pcos<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, y;
- Packet4i emm0, emm2;
-
- x = pabs(x);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* get the integer part of y */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
-
- emm2 = _mm_sub_epi32(emm2, p4i_2);
-
- /* get the swap sign flag */
- emm0 = _mm_andnot_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = pmul(x,x);
-
- y = pmadd(y,z,p4f_coscof_p1);
- y = pmadd(y,z,p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = _mm_mul_ps(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmadd(y2, x, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
-
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
+ return pcos_float(_x);
}
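For context on the refactor above: the SSE plog, pexp, psin and pcos kernels now forward to the shared implementations plog_float, pexp_float, pexp_double, psin_float and pcos_float from Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h instead of carrying their own cephes-derived code. A minimal usage sketch, assuming a standard Eigen build with SSE vectorization enabled (illustrative, not part of the patch), that reaches these packet kernels through the public Array API:

#include <Eigen/Core>
#include <iostream>

int main() {
  // Array-wise log/exp/sin/cos dispatch to plog<Packet4f>, pexp<Packet4f>,
  // psin<Packet4f> and pcos<Packet4f> when SSE vectorization is enabled.
  Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(16, 0.25f, 4.0f);
  Eigen::ArrayXf y = x.log() + x.exp() + x.sin() + x.cos();
  std::cout << y.sum() << std::endl;
  return 0;
}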
#if EIGEN_FAST_MATH
@@ -482,11 +91,11 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u);
_EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
Packet4f neg_half = pmul(_x, p4f_minus_half);
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 99d55d5e9..9c3750af0 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -18,11 +18,13 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits => 8 registers
+// 64 bits => 16 registers
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
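As a quick check of the heuristic above (a hedged note, not part of the patch): on an LP64 target sizeof(void*) == 8, so 2*sizeof(void*) evaluates to 16 registers, and a 32-bit target with sizeof(void*) == 4 gets 8, matching the two comment lines; the new !defined(EIGEN_VECTORIZE_AVX) guard presumably leaves the definition to the AVX headers when AVX is enabled.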
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
@@ -61,20 +63,22 @@ template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
+
#define vec4f_swizzle1(v,p,q,r,s) \
- (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+ (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
#define vec4i_swizzle1(v,p,q,r,s) \
- (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+ (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
#define vec2d_swizzle1(v,p,q) \
- (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
- (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+ (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
#define vec4i_swizzle2(a,b,p,q,r,s) \
- (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+ (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
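A worked example of the new EIGEN_SSE_SHUFFLE_MASK helper (illustrative, not part of the patch); it packs the four 2-bit lane selectors little-end first into the immediate expected by _mm_shuffle_epi32 and _mm_shuffle_ps:

// EIGEN_SSE_SHUFFLE_MASK(0,1,2,3) == (3<<6)|(2<<4)|(1<<2)|0 == 0xE4  (identity shuffle)
// EIGEN_SSE_SHUFFLE_MASK(3,2,1,0) == (0<<6)|(1<<4)|(2<<2)|3 == 0x1B  (lane reversal)
// e.g. vec4f_swizzle1(v,3,2,1,0) yields {v[3], v[2], v[1], v[0]}.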
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -83,7 +87,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+ const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -110,12 +114,12 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasBlend = 1
+ HasBlend = 1,
+ HasFloor = 1
#ifdef EIGEN_VECTORIZE_SSE4_1
,
HasRound = 1,
- HasFloor = 1,
HasCeil = 1
#endif
};
@@ -158,9 +162,22 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet2d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {size=2, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i> {
+ typedef int type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=false};
+};
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -180,6 +197,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { re
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
+
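A worked example of the new pset1frombits (illustrative): pset1frombits<Packet4f>(0x7f800000u) broadcasts the bit pattern of +infinity to all four lanes, which is how the _EIGEN_DECLARE_CONST_Packet4f_FROM_INT macro above now builds constants such as p4f_inf and p4f_nan without round-tripping through a float literal.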
+template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
+
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
@@ -245,19 +268,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet4f res;
+ asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet4f res = b;
asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::min.
@@ -265,13 +293,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
#endif
}
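On the refreshed workaround above (a hedged note): the GCC version check now disables the inline-asm path for gcc >= 6.3, where the bug is fixed, and the new EIGEN_VECTORIZE_AVX branch presumably emits the VEX-encoded three-operand vminps so that AVX builds do not mix in a legacy-encoded minps; the same pattern is repeated below for the double and max variants.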
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet2d res;
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet2d res = b;
asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::min.
@@ -290,13 +323,18 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
}
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet4f res;
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet4f res = b;
asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::max.
@@ -304,13 +342,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet2d res;
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet2d res = b;
asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::max.
@@ -328,16 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
#endif
}
-#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
-#endif
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+
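A note on pcmp_lt_or_nan (hedged): _mm_cmpnge_ps evaluates !(a >= b), which is true both when a < b and when either operand is NaN; this is the same "not greater-or-equal" trick the removed plog code used to flag negative and NaN inputs, now exposed as a reusable predicate for the generic kernels.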
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+ Packet4i b = _mm_castps_si128(a);
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+ Packet4i b = _mm_castpd_si128(a);
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
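A brief note on the ptrue helpers (hedged): _mm_cmpeq_epi32(b, b) is true in every lane, so ptrue returns 0xFFFFFFFF in each 32-bit lane; the float and double overloads cast through integers precisely so that NaN lanes, for which a floating-point x == x comparison would be false, still produce an all-ones mask.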
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
@@ -351,9 +402,47 @@ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+
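A brief note on the pandnot change above (hedged): _mm_andnot_ps(x, y) computes (~x) & y, so passing the operands as (b, a) makes pandnot(a, b) evaluate a & ~b, i.e. it keeps the bits of a that are not set in b. A scalar model, illustrative only and not part of Eigen:

// Scalar equivalent of the corrected pandnot semantics (illustrative only).
inline unsigned int scalar_pandnot(unsigned int a, unsigned int b) { return a & ~b; }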
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _mm_srli_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); }
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+ Packet4i emm0 = _mm_cvttps_epi32(a);
+ Packet4f tmp = _mm_cvtepi32_ps(emm0);
+ /* if greater, subtract 1 */
+ Packet4f mask = _mm_cmpgt_ps(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+
+// WARNING: this pfloor implementation makes sense for small inputs only;
+// it is currently only used by pexp and not exposed through HasFloor.
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+ const Packet2d cst_1 = pset1<Packet2d>(1.0);
+ Packet4i emm0 = _mm_cvttpd_epi32(a);
+ Packet2d tmp = _mm_cvtepi32_pd(emm0);
+ /* if greater, subtract 1 */
+ Packet2d mask = _mm_cmpgt_pd(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+#endif
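A worked example of the truncate-and-correct trick in the SSE2 pfloor fallbacks above (illustrative): for a = -1.5f, _mm_cvttps_epi32 truncates toward zero to -1, the back-converted -1.0f compares greater than a, so 1 is subtracted and the result is -2.0f == floor(-1.5f); for a = 1.5f the truncation 1.0f is not greater than a and is returned unchanged. As the WARNING notes, the Packet2d variant goes through _mm_cvttpd_epi32 and is therefore only meaningful for inputs within 32-bit integer range.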
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
@@ -517,6 +606,23 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
#endif
}
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ const Packet4i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
+ Packet4i emm0 = _mm_cvttpd_epi32(exponent);
+ emm0 = padd(emm0, cst_1023_0);
+ emm0 = _mm_slli_epi32(emm0, 20);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+ return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
+}
+
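A hedged reading of the pldexp<Packet2d> bit trick above: an IEEE-754 double stores its biased exponent in bits 52..62, i.e. bits 20..30 of the high 32-bit word, so adding the bias 1023 to the integer exponent and shifting left by 20 builds the high word of 2^n; the _MM_SHUFFLE(1,2,0,3) step then moves the two biased exponents into the high words of the two double lanes (with zero low words) before the final pmul scales a by 2^exponent. For example, with n = 3, (1023 + 3) << 20 == 0x40200000, which as the high word of a double encodes 8.0 == 2^3.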
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
@@ -718,6 +824,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
#endif // EIGEN_VECTORIZE_SSE4_1
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+// return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return _mm_movemask_ps(x) != 0x0;
+}
+
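For reference (not part of the patch): _mm_movemask_ps gathers the sign bit of each of the four float lanes into the low four bits of an int, so an all-true comparison mask yields 0xF and an all-false mask yields 0x0; predux_any therefore reports whether any lane is set, while the commented-out predux_all above would test the mask against 0xF.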
#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
@@ -921,7 +1038,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
}
// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
return ::fmaf(a,b,c);
}
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c6ca8c716..f607366f0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
return _mm_cvtps_pd(a);
}
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return _mm_castsi128_ps(a);
+}
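A note on the new preinterpret casts (hedged): unlike the pcast overloads above, which convert values (e.g. float to double via _mm_cvtps_pd), preinterpret is a pure bit-level view change. For example, reinterpreting a Packet4f filled with 1.0f as a Packet4i yields 0x3f800000 in every lane, whereas a pcast to an integer packet would yield 1; the generic math kernels use this to mask and shift IEEE-754 fields with integer ops and then cast the bits back.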
} // end namespace internal
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index c1da40d14..294cb101a 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
- enum {size=lengths, alignment=Aligned16};\
+ enum {size=lengths, alignment=Aligned16, vectorizable=true};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 95aba428f..167c3ee4c 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -91,8 +91,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 0b37f4992..c8e90f1a8 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -239,9 +239,9 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
/* Forward declaration */
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);