Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r--  Eigen/src/Core/arch/AVX/MathFunctions.h              132
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h                   4
-rw-r--r--  Eigen/src/Core/arch/AVX/TypeCasting.h                 51
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMath.h                 16
-rw-r--r--  Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h 110
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h               52
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h                   2
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h                 77
8 files changed, 429 insertions, 15 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 2810a7a0b..06cd56684 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) {
return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+ Packet4d x = _x;
+
+ _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+ _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+ _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+ _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+ _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+ _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+ Packet4d tmp, fx;
+
+ // clamp x
+ x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+ // Express exp(x) as exp(g + n*log(2)).
+ fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+ // Get n, the (rounded) integer quotient x / log(2), i.e. the "n" described above.
+ fx = _mm256_floor_pd(fx);
+
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2 with C1 + C2 = log(2), to get
+ // the last digits right.
+ tmp = pmul(fx, p4d_cephes_exp_C1);
+ Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ Packet4d x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial of the rational interpolant.
+ Packet4d px = p4d_cephes_exp_p0;
+ px = pmadd(px, x2, p4d_cephes_exp_p1);
+ px = pmadd(px, x2, p4d_cephes_exp_p2);
+ px = pmul(px, x);
+
+ // Evaluate the denominator polynomial of the rational interpolant.
+ Packet4d qx = p4d_cephes_exp_q0;
+ qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+ qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+ qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+ // I don't really get this bit, copied from the SSE2 routines, so...
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
+ // rational interpolant?
+ x = _mm256_div_pd(px, psub(qx, px));
+ x = pmadd(p4d_2, x, p4d_1);
+
+ // Build e=2^n by constructing the exponents in a 128-bit vector and
+ // shifting them to where they belong in double-precision values.
+ __m128i emm0 = _mm256_cvtpd_epi32(fx);
+ emm0 = _mm_add_epi32(emm0, p4i_1023);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i lo = _mm_slli_epi64(emm0, 52);
+ __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+ __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+ e = _mm256_insertf128_si256(e, hi, 1);
+
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+ // non-finite values in the input.
+ return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+}
+
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -300,15 +380,59 @@ psqrt<Packet8f>(const Packet8f& _x) {
return pmul(_x, x);
}
#else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
return _mm256_sqrt_ps(x);
}
#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
return _mm256_sqrt_pd(x);
}
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+ _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+ Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+ // select only the inverse sqrt of positive normal inputs (denormals are
+ // flushed to zero and cause infs as well).
+ Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+ Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+ // Fill in NaNs and Infs for the negative/zero entries.
+ Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+ Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+ Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+ _mm256_and_ps(zero_mask, p8f_inf));
+
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+ // Insert NaNs and Infs in all the right places.
+ return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+ _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+ return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+ _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+ return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+}
+
} // end namespace internal
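
The new pexp<Packet4d> above follows the classic Cephes exp recipe: clamp the input, split x into g + n*log(2) (with n obtained by rounding x*log2(e)), evaluate a rational approximant of exp(g), and rescale by 2^n built directly from the IEEE-754 exponent field. The px/(qx - px) step that the TODO puzzles over is simply the algebraic form of exp(g) ~= (Q(g^2) + g*P(g^2)) / (Q(g^2) - g*P(g^2)), since 1 + 2*gP/(Q - gP) = (Q + gP)/(Q - gP). Below is a minimal scalar sketch of the same computation; the clamping and non-finite handling are omitted and the helper name is illustrative, not part of the patch.

// Scalar sketch of the range reduction, rational approximation and 2^n
// reconstruction that pexp<Packet4d> performs four lanes at a time. The
// constants are the Cephes coefficients from the patch.
#include <cmath>
#include <cstdint>
#include <cstring>

double exp_scalar_sketch(double x) {
  // n = round(x / log(2)), obtained as floor(x * log2(e) + 0.5).
  double n = std::floor(x * 1.4426950408889634073599 + 0.5);
  // g = x - n*log(2), subtracted in two pieces (C1 + C2 == log(2)) for accuracy.
  double g = x - n * 0.693145751953125 - n * 1.42860682030941723212e-6;

  // exp(g) ~= (Q(g^2) + g*P(g^2)) / (Q(g^2) - g*P(g^2)) = 1 + 2*g*P / (Q - g*P).
  double g2 = g * g;
  double gP = ((1.26177193074810590878e-4 * g2 + 3.02994407707441961300e-2) * g2
               + 9.99999999999999999910e-1) * g;
  double Q  = ((3.00198505138664455042e-6 * g2 + 2.52448340349684104192e-3) * g2
               + 2.27265548208155028766e-1) * g2 + 2.0;
  double r  = 1.0 + 2.0 * gP / (Q - gP);

  // Rescale by 2^n by writing n + 1023 into the IEEE-754 exponent field; the
  // _mm_slli_epi64 / insertf128 dance above does the same for four doubles.
  std::uint64_t bits =
      static_cast<std::uint64_t>(static_cast<std::int64_t>(n) + 1023) << 52;
  double two_n;
  std::memcpy(&two_n, &bits, sizeof two_n);
  return r * two_n;
}
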
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 695185a49..1e751dc32 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -65,6 +65,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
@@ -79,8 +80,9 @@ template<> struct packet_traits<double> : default_packet_traits
HasHalfPacket = 1,
HasDiv = 1,
- HasExp = 0,
+ HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
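
The HasExp/HasRsqrt flags are what the rest of Eigen consults when deciding whether a coefficient-wise operation may take the packet path. A hedged sketch of the usual gating pattern follows; it assumes the generic default_packet_traits (outside this directory) supplies HasRsqrt = 0 as a fallback, and the functor is purely illustrative, not part of the patch.

// Hedged sketch: a functor advertises packet support only when the active
// PacketMath.h sets the corresponding flag, which is exactly what this hunk
// turns on for AVX.
#include <Eigen/Core>
#include <cmath>

namespace Eigen { namespace internal {

template <typename Scalar> struct my_rsqrt_op {  // hypothetical functor
  Scalar operator()(const Scalar& a) const { return Scalar(1) / std::sqrt(a); }
  template <typename Packet>
  Packet packetOp(const Packet& a) const { return prsqrt(a); }  // arch kernel from this patch
};

template <typename Scalar>
struct functor_traits<my_rsqrt_op<Scalar> > {
  enum {
    Cost = 5 * NumTraits<Scalar>::MulCost,
    // Vectorize only where the active PacketMath.h advertises HasRsqrt = 1.
    PacketAccess = packet_traits<Scalar>::HasRsqrt
  };
};

}} // namespace Eigen::internal

In Eigen itself the exp functor is gated the same way on HasExp, which is why flipping HasExp to 1 for double is enough to vectorize array exp() on AVX.
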
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 000000000..83bfdc604
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 0,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 0,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+ return _mm256_cvtps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+ return _mm256_cvtepi32_ps(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
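
type_casting_traits tells the evaluator whether a cast may stay in packet form and how many source packets map to how many target packets. Here the float/int casts are declared non-vectorized (VectorizedCast = 0) because, as the header comment says, integers are still handled with SSE, even though the pcast kernels themselves are provided. A rough usage sketch, assuming a translation unit compiled with AVX enabled so Packet8f/Packet8i are the active packet types:

#include <Eigen/Core>
#include <iostream>

int main() {
  using namespace Eigen::internal;
  // Convert eight floats to eight ints in one shot via the new pcast kernel.
  Packet8f f = pset1<Packet8f>(3.7f);
  Packet8i i = pcast<Packet8f, Packet8i>(f);  // _mm256_cvtps_epi32 under the hood
  int out[8];
  pstoreu(out, i);
  std::cout << out[0] << "\n";  // 4 with the default (round-to-nearest) MXCSR mode
  return 0;
}
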
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons
}
#endif
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
return make_double2(from[0*stride], from[1*stride]);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
to[stride*2] = from.z;
to[stride*3] = from.w;
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
}
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a)
}
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
- return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
- return make_double2(abs(a.x), abs(a.y));
+ return make_double2(fabs(a.x), fabs(a.y));
}
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) {
kernel.packet[3].z = tmp;
}
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<double2,2>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
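
Two things happen in the CUDA hunks: the gather/scatter specializations switch their stride argument from int to Index so they keep matching the generic pgather/pscatter declarations (and large strides are not truncated to 32 bits), and pabs uses fabsf/fabs so float lanes no longer round-trip through double. The ptranspose overloads also drop the spurious template<>, since they are plain overloads rather than template specializations. Below is a plain C++ stand-in for what the float4 gather/scatter does; gather4/scatter4 and float4_sketch are hypothetical names, not Eigen API.

#include <cstddef>

struct float4_sketch { float x, y, z, w; };
typedef std::ptrdiff_t Index;  // Eigen::Index is a signed, pointer-sized type

inline float4_sketch gather4(const float* from, Index stride) {
  // Same access pattern as pgather<float, float4>: four elements, one stride apart.
  return { from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride] };
}

inline void scatter4(float* to, const float4_sketch& v, Index stride) {
  to[0 * stride] = v.x; to[1 * stride] = v.y;
  to[2 * stride] = v.z; to[3 * stride] = v.w;
}
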
diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
new file mode 100644
index 000000000..5007c155d
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+
+namespace Eigen {
+namespace internal {
+
+/* The following lookup table was generated from measurements on a Nexus 5,
+ * which has a Qualcomm Krait 400 CPU. This is very representative of current
+ * 32bit (ARMv7) Android devices. On the other hand, I don't know how
+ * representative that is outside of these conditions. Accordingly,
+ * let's only use this lookup table on ARM 32bit on Android for now.
+ *
+ * Measurements were single-threaded, with Scalar=float, compiled with
+ * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32.
+ *
+ * The device was cooled, allowing it to run at the max clock speed throughout.
+ * This may not be representative of real-world thermal conditions.
+ *
+ * The benchmark attempted to flush caches to test cold-cache performance.
+ */
+#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID
+template<>
+struct BlockingSizesLookupTable<float, float> {
+ static const size_t BaseSize = 16;
+ static const size_t NumSizes = 8;
+ static const unsigned short* Data() {
+ static const unsigned short data[512] = {
+ 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447,
+ 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456,
+ 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467,
+ 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478,
+ 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476,
+ 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488,
+ 0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6,
+ 0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5,
+ 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b,
+ 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b,
+ 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b,
+ 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576,
+ 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587,
+ 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597,
+ 0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6,
+ 0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6,
+ 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b,
+ 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b,
+ 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667,
+ 0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675,
+ 0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686,
+ 0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697,
+ 0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696,
+ 0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696,
+ 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747,
+ 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757,
+ 0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766,
+ 0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777,
+ 0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788,
+ 0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796,
+ 0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796,
+ 0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796,
+ 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848,
+ 0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857,
+ 0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865,
+ 0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877,
+ 0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886,
+ 0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896,
+ 0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896,
+ 0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896,
+ 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848,
+ 0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858,
+ 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867,
+ 0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877,
+ 0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887,
+ 0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896,
+ 0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896,
+ 0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896,
+ 0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848,
+ 0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857,
+ 0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867,
+ 0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+ 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887,
+ 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897,
+ 0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896,
+ 0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896,
+ 0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946,
+ 0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856,
+ 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867,
+ 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+ 0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887,
+ 0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897,
+ 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896,
+ 0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896
+ };
+ return data;
+ }
+};
+#endif
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
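
Each 16-bit table entry packs three 4-bit exponents, one per GEMM blocking dimension, so e.g. 0x444 stands for 16x16x16 blocks and 0x5a6 for 32x1024x64; the 512 = 8x8x8 entries are indexed by the rounded log2 of the three problem sizes, from BaseSize = 16 over NumSizes = 8 octaves up to 2048. The decoder lives outside Eigen/src/Core/arch and is not part of this diff, so the nibble-to-dimension assignment in the sketch below is an assumption for illustration only.

#include <cstddef>

struct BlockSizes { std::size_t k, m, n; };

// Hedged decoding sketch for one lookup-table entry.
inline BlockSizes decode_entry_sketch(unsigned short entry) {
  BlockSizes b;
  b.k = std::size_t(1) << ((entry >> 8) & 0xf);  // high nibble   (assumed k)
  b.m = std::size_t(1) << ((entry >> 4) & 0xf);  // middle nibble (assumed m)
  b.n = std::size_t(1) << ((entry >> 0) & 0xf);  // low nibble    (assumed n)
  return b;
}
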
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index f86c0a39a..3b8b7303f 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -462,11 +462,59 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
#else
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
#endif
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+ Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+ // select only the inverse sqrt of positive normal inputs (denormals are
+ // flushed to zero and cause infs as well).
+ Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+ Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+ // Fill in NaNs and Infs for the negative/zero entries.
+ Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+ Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+ Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+ _mm_and_ps(zero_mask, p4f_inf));
+
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+ // Insert NaNs and Infs in all the right places.
+ return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+ // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
+ return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+ // There is no double-precision counterpart of the fast _mm_rsqrt_ps approximation, so use a plain division by the square root.
+ return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
} // end namespace internal
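
The EIGEN_FAST_MATH prsqrt path above starts from the hardware estimate _mm_rsqrt_ps (about 12 correct bits) and applies one Newton-Raphson step for f(y) = 1/y^2 - x, i.e. y1 = y0*(1.5 - 0.5*x*y0^2), which is exactly what pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)) computes and which roughly doubles the number of correct bits. A scalar sketch of the same refinement; the special-case handling of zero, negative and denormal inputs is omitted.

#include <xmmintrin.h>

inline float rsqrt_one_newton_step(float x) {
  // _mm_rsqrt_ss gives roughly 12 bits of precision.
  float y0 = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
  // One Newton step brings normal, positive x to nearly full float precision.
  return y0 * (1.5f - 0.5f * x * y0 * y0);
}
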
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3653783fd..38a84273d 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -108,6 +108,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
@@ -124,6 +125,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 000000000..c84893230
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return _mm_cvttps_epi32(a);
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return _mm_cvtepi32_ps(a);
+}
+
+
+template <>
+struct type_casting_traits<double, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 2,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+ return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 2
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+ // Simply discard the second half of the input
+ return _mm_cvtps_pd(a);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
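
SrcCoeffRatio and TgtCoeffRatio describe how many scalar coefficients one cast call consumes and produces relative to the packet width: the double-to-float cast takes two Packet2d to fill one Packet4f (hence the two-argument pcast), while the float-to-double cast converts only the low half of its Packet4f per call. A rough usage sketch, assuming an SSE2 build so Packet4f/Packet2d are the active packet types:

#include <Eigen/Core>
#include <iostream>

int main() {
  using namespace Eigen::internal;
  // double -> float: SrcCoeffRatio = 2, so two Packet2d feed one Packet4f.
  Packet2d lo = pset1<Packet2d>(1.25), hi = pset1<Packet2d>(2.5);
  Packet4f f  = pcast<Packet2d, Packet4f>(lo, hi);  // {1.25f, 1.25f, 2.5f, 2.5f}

  // float -> double: TgtCoeffRatio = 2, only the low half of f is converted.
  Packet2d d  = pcast<Packet4f, Packet2d>(f);       // {1.25, 1.25}

  float fout[4]; double dout[2];
  pstoreu(fout, f);
  pstoreu(dout, d);
  std::cout << fout[2] << " " << dout[0] << "\n";   // prints "2.5 1.25"
  return 0;
}
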