diff options
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r-- | Eigen/src/Core/arch/AVX/MathFunctions.h | 132 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/TypeCasting.h | 51 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMath.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h | 110 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/MathFunctions.h | 52 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/TypeCasting.h | 77 |
8 files changed, 429 insertions, 15 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 2810a7a0b..06cd56684 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) { return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); } +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +pexp<Packet4d>(const Packet4d& _x) { + Packet4d x = _x; + + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + + Packet4d tmp, fx; + + // clamp x + x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = _mm256_floor_pd(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, p4d_cephes_exp_C1); + Packet4d z = pmul(fx, p4d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet4d x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet4d px = p4d_cephes_exp_p0; + px = pmadd(px, x2, p4d_cephes_exp_p1); + px = pmadd(px, x2, p4d_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet4d qx = p4d_cephes_exp_q0; + qx = pmadd(qx, x2, p4d_cephes_exp_q1); + qx = pmadd(qx, x2, p4d_cephes_exp_q2); + qx = pmadd(qx, x2, p4d_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = _mm256_div_pd(px, psub(qx, px)); + x = pmadd(p4d_2, x, p4d_1); + + // Build e=2^n by constructing the exponents in a 128-bit vector and + // shifting them to where they belong in double-precision values. + __m128i emm0 = _mm256_cvtpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + return pmax(pmul(x, Packet4d(e)), _x); +} + // Functions for sqrt. // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step // of Newton's method, at a cost of 1-2 bits of precision as opposed to the @@ -300,15 +380,59 @@ psqrt<Packet8f>(const Packet8f& _x) { return pmul(_x, x); } #else -template <> -EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) { +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt<Packet8f>(const Packet8f& x) { return _mm256_sqrt_ps(x); } #endif -template <> -EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) { +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d psqrt<Packet4d>(const Packet4d& x) { return _mm256_sqrt_pd(x); } +#if EIGEN_FAST_MATH + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt<Packet8f>(const Packet8f& _x) { + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); + + Packet8f neg_half = pmul(_x, p8f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); + Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); + Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), + _mm256_and_ps(zero_mask, p8f_inf)); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm256_or_ps(x, infs_and_nans); +} + +#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt<Packet8f>(const Packet8f& x) { + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); +} +#endif + +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d prsqrt<Packet4d>(const Packet4d& x) { + _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); +} + } // end namespace internal diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 695185a49..1e751dc32 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -65,6 +65,7 @@ template<> struct packet_traits<float> : default_packet_traits HasLog = 1, HasExp = 1, HasSqrt = 1, + HasRsqrt = 1, HasBlend = 1 }; }; @@ -79,8 +80,9 @@ template<> struct packet_traits<double> : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0, + HasExp = 1, HasSqrt = 1, + HasRsqrt = 1, HasBlend = 1 }; }; diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h new file mode 100644 index 000000000..83bfdc604 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_AVX_H +#define EIGEN_TYPE_CASTING_AVX_H + +namespace Eigen { + +namespace internal { + +// For now we use SSE to handle integers, so we can't use AVX instructions to cast +// from int to float +template <> +struct type_casting_traits<float, int> { + enum { + VectorizedCast = 0, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits<int, float> { + enum { + VectorizedCast = 0, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + + +template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) { + return _mm256_cvtepi32_ps(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 19749c832..ceed1d1ef 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons } #endif -template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) { +template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) { return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); } -template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) { +template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) { return make_double2(from[0*stride], from[1*stride]); } -template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) { +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) { to[stride*0] = from.x; to[stride*1] = from.y; to[stride*2] = from.z; to[stride*3] = from.w; } -template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) { +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) { to[stride*0] = from.x; to[stride*1] = from.y; } @@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) } template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { - return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { - return make_double2(abs(a.x), abs(a.y)); + return make_double2(fabs(a.x), fabs(a.y)); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4,4>& kernel) { double tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; @@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) { kernel.packet[3].z = tmp; } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2,2>& kernel) { double tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h new file mode 100644 index 000000000..5007c155d --- /dev/null +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H +#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H + +namespace Eigen { +namespace internal { + +/* The following lookup table was generated from measurements on a Nexus 5, + * which has a Qualcomm Krait 400 CPU. This is very representative of current + * 32bit (ARMv7) Android devices. On the other hand, I don't know how + * representative that is outside of these conditions. Accordingly, + * let's only use this lookup table on ARM 32bit on Android for now. + * + * Measurements were single-threaded, with Scalar=float, compiled with + * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32. + * + * The device was cooled, allowing it to run a the max clock speed throughout. + * This may not be representative of real-world thermal conditions. + * + * The benchmark attempted to flush caches to test cold-cache performance. + */ +#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID +template<> +struct BlockingSizesLookupTable<float, float> { + static const size_t BaseSize = 16; + static const size_t NumSizes = 8; + static const unsigned short* Data() { + static const unsigned short data[512] = { + 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, + 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456, + 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, + 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488, + 0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6, + 0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5, + 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b, + 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587, + 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597, + 0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6, + 0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6, + 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b, + 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b, + 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667, + 0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675, + 0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686, + 0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697, + 0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696, + 0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696, + 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747, + 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757, + 0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766, + 0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777, + 0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788, + 0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796, + 0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796, + 0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848, + 0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857, + 0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865, + 0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877, + 0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886, + 0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896, + 0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896, + 0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, + 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848, + 0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858, + 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867, + 0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896, + 0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896, + 0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848, + 0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857, + 0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897, + 0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896, + 0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946, + 0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867, + 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897, + 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, + 0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 + }; + return data; + } +}; +#endif + +} +} + +#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index f86c0a39a..3b8b7303f 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -462,11 +462,59 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x) #else -template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } +template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } #endif -template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } + +#if EIGEN_FAST_MATH + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& _x) { + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + + Packet4f neg_half = pmul(_x, p4f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); + Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); + Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); + Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), + _mm_and_ps(zero_mask, p4f_inf)); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm_or_ps(x, infs_and_nans); +} + +#else + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& x) { + // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. + return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x)); +} + +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt<Packet2d>(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. + return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x)); +} } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3653783fd..38a84273d 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,6 +108,7 @@ template<> struct packet_traits<float> : default_packet_traits HasLog = 1, HasExp = 1, HasSqrt = 1, + HasRsqrt = 1, HasBlend = 1 }; }; @@ -124,6 +125,7 @@ template<> struct packet_traits<double> : default_packet_traits HasDiv = 1, HasExp = 1, HasSqrt = 1, + HasRsqrt = 1, HasBlend = 1 }; }; diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h new file mode 100644 index 000000000..c84893230 --- /dev/null +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_SSE_H +#define EIGEN_TYPE_CASTING_SSE_H + +namespace Eigen { + +namespace internal { + +template <> +struct type_casting_traits<float, int> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + + +template <> +struct type_casting_traits<int, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + + +template <> +struct type_casting_traits<double, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} + +template <> +struct type_casting_traits<float, double> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) { + // Simply discard the second half of the input + return _mm_cvtps_pd(a); +} + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SSE_H |