aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/eigen3/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-12 11:11:40 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-12 11:11:40 -0800
commit42673c3a588ce4cc20b02ab20e5e9d38b64a3cb4 (patch)
tree1545b8e2411a774728685a4da519058897d49ee5 /third_party/eigen3/Eigen/src/Core/arch/SSE
parentccf01f9d77b28b649777f5a937a295f6dee2a130 (diff)
Deleted the remainder of the local copy of eigen that is shipped with
TensorFlow.
Diffstat (limited to 'third_party/eigen3/Eigen/src/Core/arch/SSE')
-rw-r--r--third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h486
-rw-r--r--third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h529
-rw-r--r--third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h883
-rw-r--r--third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h77
4 files changed, 0 insertions, 1975 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h
deleted file mode 100644
index 2722893dcf..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h
+++ /dev/null
@@ -1,486 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_SSE_H
-#define EIGEN_COMPLEX_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet2cf
-{
- EIGEN_STRONG_INLINE Packet2cf() {}
- EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
- __m128 v;
-};
-
-// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
-// to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
- typedef Packet2cf type;
- typedef Packet2cf half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 2,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0,
- HasBlend = 1,
- };
-};
-#endif
-
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for SSE3 and 4
- #ifdef EIGEN_VECTORIZE_SSE3
- return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
- _mm_mul_ps(_mm_movehdup_ps(a.v),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-// _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-// vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
- return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
- #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
- // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
- res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
- // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wuninitialized"
- res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
- #pragma GCC diagnostic pop
-#else
- res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-#endif
- return Packet2cf(_mm_movelh_ps(res.v,res.v));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
-
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
-{
- return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
-{
- to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
- to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
-}
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- #if EIGEN_GNUC_AT_MOST(4,3)
- // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
- // This workaround also fix invalid code generation with gcc 4.3
- EIGEN_ALIGN16 std::complex<float> res[2];
- _mm_store_ps((float*)res, a.v);
- return res[0];
- #else
- std::complex<float> res;
- _mm_storel_pi((__m64*)&res, a.v);
- return res;
- #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
- return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
- static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
- {
- if (Offset==1)
- {
- first.v = _mm_movehl_ps(first.v, first.v);
- first.v = _mm_movelh_ps(first.v, second.v);
- }
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(a, pconj(b));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
- _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(pconj(a), b);
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return pconj(internal::pmul(a, b));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
- _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for SSE3 and 4
- Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
- __m128 s = _mm_mul_ps(b.v,b.v);
- return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
-}
-
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
- return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
-}
-
-
-//---------- double ----------
-struct Packet1cd
-{
- EIGEN_STRONG_INLINE Packet1cd() {}
- EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
- __m128d v;
-};
-
-// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
-// to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
- typedef Packet1cd type;
- typedef Packet1cd half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 0,
- size = 1,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-#endif
-
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_xor_pd(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for SSE3 and 4
- #ifdef EIGEN_VECTORIZE_SSE3
- return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
- return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0)), mask)));
- #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
-
-// FIXME force unaligned load, this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
-
-// FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- EIGEN_ALIGN16 double res[2];
- _mm_store_pd(res, a.v);
- return std::complex<double>(res[0],res[1]);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
- return vecs[0];
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it sure we never have to align a Packet1cd?
- // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(a, pconj(b));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(pconj(a), b);
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0)), mask)));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return pconj(internal::pmul(a, b));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for SSE3 and 4
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- __m128d s = _mm_mul_pd(b.v,b.v);
- return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
- return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
- __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
- __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
-
- __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
- kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
- kernel.packet[1].v = tmp;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
- __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
- return Packet2cf(_mm_castpd_ps(result));
-}
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
deleted file mode 100644
index 0baa7b4b58..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ /dev/null
@@ -1,529 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2007 Julien Pommier
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
-#define EIGEN_MATH_FUNCTIONS_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- /* the smallest non denormalized float number */
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
-
- /* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
- Packet4i emm0;
-
- // invalid_mask is set to true when x is NaN
- Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps());
- Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
- /* keep only the fractional part */
- x = _mm_and_ps(x, p4f_inv_mant_mask);
- x = _mm_or_ps(x, p4f_half);
-
- emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
- Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
- _mm_and_ps(iszero_mask, p4f_minus_inf));
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_ps(fx);
-#else
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, substract 1 */
- Packet4f mask = _mm_cmpgt_ps(tmp, fx);
- mask = _mm_and_ps(mask, p4f_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
-
- // build 2^n
- emm0 = _mm_cvttps_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_0x7f);
- emm0 = _mm_slli_epi32(emm0, 23);
- return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
-}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
- static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
- Packet2d tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_pd(fx);
-#else
- emm0 = _mm_cvttpd_epi32(fx);
- tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
- Packet2d mask = _mm_cmpgt_pd(tmp, fx);
- mask = _mm_and_pd(mask, p2d_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = _mm_cvttpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023_0);
- emm0 = _mm_slli_epi32(emm0, 20);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
-}
-
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
-
- The code is the exact rewriting of the cephes sinf function.
- Precision is excellent as long as x < 8192 (I did not bother to
- take into account the special handling they have for greater values
- -- it does not return garbage for arguments over 8192, though, but
- the extra precision is missing).
-
- Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- surprising but correct result.
-*/
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psin<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
- Packet4i emm0, emm2;
- sign_bit = x;
- /* take the absolute value */
- x = pabs(x);
-
- /* take the modulo */
-
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* store the integer part of y in mm0 */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
- /* get the swap sign flag */
- emm0 = _mm_and_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask
- there is one polynom for 0 <= x <= Pi/4
- and another one for Pi/4<x<=Pi/2
-
- Both branches will be computed.
- */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = _mm_mul_ps(x,x);
-
- y = pmadd(y, z, p4f_coscof_p1);
- y = pmadd(y, z, p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = pmul(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmul(y2, x);
- y2 = padd(y2, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
-}
-
-/* Packet cosine of four floats; almost the same as psin (same constants and polynomials, different octant/sign handling) */
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pcos<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); // DP1+DP2+DP3 add up to about Pi/4, split in three parts
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); // coefficients of the sine polynomial
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); // coefficients of the cosine polynomial
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, y;
- Packet4i emm0, emm2;
-
- x = pabs(x); // only |x| is used from here on
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* get the integer part of y */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2); // y now holds the even integer j as a float
-
- emm2 = _mm_sub_epi32(emm2, p4i_2); // j - 2; the visible psin code has no such step
-
- /* get the swap sign flag: bit 2 of ~emm2, moved into the float sign bit (bit 31) by the shift */
- emm0 = _mm_andnot_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask: all ones where (emm2 & 2) == 0, which selects y2 below */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4), built from the coscof coefficients */
- y = p4f_coscof_p0;
- Packet4f z = pmul(x,x);
-
- y = pmadd(y,z,p4f_coscof_p1);
- y = pmadd(y,z,p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = _mm_mul_ps(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1); // y = 1 - z/2 + z*z*((p0*z + p1)*z + p2)
-
- /* Evaluate the second polynom (Pi/4 < x <= Pi/2), built from the sincof coefficients */
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmadd(y2, x, x); // y2 = x + x*z*((p0*z + p1)*z + p2)
-
- /* select the correct result from the two polynoms: y2 where poly_mask is set, y elsewhere */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
-
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
-}
-
-#if EIGEN_FAST_MATH
-
-// Approximate packet square root, computed as x * rsqrt(x) with one refinement step.
-// This is based on Quake3's fast inverse square root; for details see http://www.beyond3d.com/content/articles/8/
-// It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psqrt<Packet4f>(const Packet4f& _x)
-{
- Packet4f half = pmul(_x, pset1<Packet4f>(.5f)); // 0.5 * _x, used by the refinement step below
-
- /* select only the inverse sqrt of inputs >= the smallest normal float; all other lanes get a zero estimate */
- Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
- Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
-
- x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x)))); // one Newton step: x = x * (1.5 - 0.5*_x*x*x)
- return pmul(_x,x); // sqrt(_x) = _x * rsqrt(_x)
-}
-
-#else
-
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } // full-precision square root instruction
-
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } // doubles use _mm_sqrt_pd regardless of EIGEN_FAST_MATH
-
-
-#if EIGEN_FAST_MATH
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& _x) { // fast 1/sqrt(x): hardware estimate plus one Newton step, with NaN/Inf patched in
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); // bit pattern of +inf
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); // bit pattern of a quiet NaN
- _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); // bit pattern of the smallest positive normal float
-
- Packet4f neg_half = pmul(_x, p4f_minus_half);
-
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); // NOTE(review): "<=" also masks x == flt_min itself, which then yields +inf - confirm intended
- Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
-
- // Fill in NaNs and Infs for the negative/zero entries.
- Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
- Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
- Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
- _mm_and_ps(zero_mask, p4f_inf));
-
- // Do a single step of Newton's iteration: x = x * (1.5 - 0.5 * _x * x * x). NOTE(review): +inf inputs are not special-cased here.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
-
- // Insert NaNs and Infs in all the right places (x is zero in every masked lane, so the OR just writes the special value).
- return _mm_or_ps(x, infs_and_nans);
-}
-
-#else
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
- // Unfortunately we can't use the much faster _mm_rsqrt_ps here since it only provides an approximation.
- return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x)); // 1 / sqrt(x) with a full-precision divide and square root
-}
-
-#endif
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
- // Computed as 1 / sqrt(x). NOTE(review): the original comment cited a faster approximate "mm_rqsrt_pd"; confirm which intrinsic was meant.
- return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
-}
-
-// Same ptanh as in GenericPacketMath.h, except that for doubles the
-// small/medium approximation threshold is 0.001.
-template<> EIGEN_STRONG_INLINE Packet2d ptanh_approx_threshold() {
- return pset1<Packet2d>(0.001); // threshold broadcast to both lanes
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
deleted file mode 100644
index 7f4274fd99..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
+++ /dev/null
@@ -1,883 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_SSE_H
-#define EIGEN_PACKET_MATH_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 // default, used only when not already defined
-#endif
-
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) // 8 with 4-byte pointers, 16 with 8-byte pointers
-#endif
-
-#ifdef __FMA__ // compiler targets FMA: advertise a single-instruction multiply-add unless already configured
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
-#endif
-#endif
-
-typedef __m128 Packet4f; // packet of four floats
-typedef __m128i Packet4i; // packet of four 32-bit ints
-typedef __m128d Packet2d; // packet of two doubles
-
-template<> struct is_arithmetic<__m128> { enum { value = true }; }; // mark the three raw SSE register types as arithmetic
-template<> struct is_arithmetic<__m128i> { enum { value = true }; };
-template<> struct is_arithmetic<__m128d> { enum { value = true }; };
-
-#define vec4f_swizzle1(v,p,q,r,s) \
- (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) // float lanes (v[p], v[q], v[r], v[s])
-
-#define vec4i_swizzle1(v,p,q,r,s) \
- (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) // int lanes (v[p], v[q], v[r], v[s])
-
-#define vec2d_swizzle1(v,p,q) \
- (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (((q)*2+1)<<6|((q)*2)<<4|((p)*2+1)<<2|((p)*2))))) // double lanes (v[p], v[q]); p and q are parenthesized so expression arguments expand correctly
-
-#define vec4f_swizzle2(a,b,p,q,r,s) \
- (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) // float lanes (a[p], a[q], b[r], b[s])
-
-#define vec4i_swizzle2(a,b,p,q,r,s) \
- (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) // int lanes (a[p], a[q], b[r], b[s])
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f p4f_##NAME = pset1<Packet4f>(X) // declares p4f_<NAME>: X broadcast to four floats
-
-#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- const Packet2d p2d_##NAME = pset1<Packet2d>(X) // declares p2d_<NAME>: X broadcast to two doubles
-
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X)) // declares p4f_<NAME>: the integer bit pattern X reinterpreted as floats
-
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i p4i_##NAME = pset1<Packet4i>(X) // declares p4i_<NAME>: X broadcast to four ints
-
-
-// The float and double packet_traits below are compiled only without AVX vectorization;
-// with AVX, the packet_traits defined in AVX/PacketMath.h are used instead.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef Packet4f type;
- typedef Packet4f half; // same type as the full packet; HasHalfPacket is 0 below
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasSin = EIGEN_FAST_MATH, // sin/cos are advertised only when EIGEN_FAST_MATH is nonzero
- HasCos = EIGEN_FAST_MATH,
- HasTanH = 1,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
-
- HasBlend = 1,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef Packet2d type;
- typedef Packet2d half; // same type as the full packet; HasHalfPacket is 0 below
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=2,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasTanH = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
-
- HasBlend = 1,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-#endif
-template<> struct packet_traits<int> : default_packet_traits // defined whether or not AVX is enabled
-{
- typedef Packet4i type;
- typedef Packet4i half;
- enum {
- // FIXME check the Has* flags; every flag not listed here keeps its default_packet_traits value
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
-
- HasBlend = 1,
- };
-};
-
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; // reverse mapping: packet type -> scalar type and lane count
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
-
-#if EIGEN_COMP_MSVC==1500
-// Workaround for an MSVC 9 internal compiler error: spell out every lane with _mm_set_* instead of the broadcast intrinsics.
-// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
-// TODO: let's check whether a better fix exists, like adding a pset0() function (it crashed on pset1(0)).
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } // broadcast `from` to every lane
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
-#endif
-
-// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
-// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
-// Using inline assembly is also not an option because gcc then fails to reorder the instructions properly.
-// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
-// Also note that with AVX, we want it to generate a vbroadcastss.
-#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
-template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
- return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); // load one float into lane 0, then copy lane 0 to all four lanes
-}
-#endif
-
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } // lanes a, a+1, a+2, a+3
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } // lanes a, a+1
-#endif
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } // lanes a, a+1, a+2, a+3
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } // lane-wise a + b
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } // lane-wise a - b
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ple<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } // lane mask: all ones where a <= b, else zero
-template<> EIGEN_STRONG_INLINE Packet2d ple<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f plt<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } // lane mask: all ones where a < b, else zero
-template<> EIGEN_STRONG_INLINE Packet2d plt<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f peq<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } // lane mask: all ones where a == b, else zero
-template<> EIGEN_STRONG_INLINE Packet2d peq<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) { // picks b where false_mask is set, a elsewhere
-#if defined(EIGEN_VECTORIZE_SSE4_1)
- return _mm_blendv_ps(a, b, false_mask); // selects per lane on the mask's sign bit
-#else
- return _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)); // selects per bit; presumably lanes are all-ones or all-zeros so both paths agree - verify against callers
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) { // double version of the select above
-#if defined(EIGEN_VECTORIZE_SSE4_1)
- return _mm_blendv_pd(a, b, false_mask);
-#else
- return _mm_or_pd(_mm_andnot_pd(false_mask, a), _mm_and_pd(false_mask, b));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); // sign bit of each float
- return _mm_xor_ps(a,mask); // flipping the sign bit negates the value
-}
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000)); // sign bit lives in the upper 32-bit half of each double
- return _mm_xor_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
- return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); // integers are negated as 0 - a
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } // conjugate of a real packet is the packet itself
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); } // lane-wise a * b
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_mullo_epi32(a,b); // low 32 bits of each lane product
-#else
- // this version is slightly faster than 4 scalar products
- return vec4i_swizzle1(
- vec4i_swizzle2(
- _mm_mul_epu32(a,b), // 64-bit products of lanes 0 and 2
- _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2), // 64-bit products of lanes 1 and 3 (moved into even positions first)
- vec4i_swizzle1(b,1,0,3,2)),
- 0,2,0,2), // keep the low 32 bits of each product: (p0, p2, p1, p3)
- 0,2,1,3); // reorder to (p0, p1, p2, p3)
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } // lane-wise a / b
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by SSE"); // integer packets cannot be divided: always asserts
- return pset1<Packet4i>(0); // value returned after the assert expression
-}
-
-// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } // a*b + c
-#ifdef __FMA__
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } // fused a*b + c, only when compiling with FMA
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } // lane-wise minimum
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_min_epi32(a,b);
-#else
- // after some bench, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmplt_epi32(a,b); // all ones where a < b
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); // a where a < b, b elsewhere
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } // lane-wise maximum
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_max_epi32(a,b);
-#else
- // after some bench, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmpgt_epi32(a,b); // all ones where a > b
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); // a where a > b, b elsewhere
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } // bitwise a & b
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } // bitwise a | b
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } // bitwise a ^ b
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } // NOTE(review): the intrinsic computes (~a) & b - confirm this operand order matches the generic pandnot contract
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } // aligned load: `from` must be 16-byte aligned
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-
-#if EIGEN_COMP_MSVC
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD
- #if (EIGEN_COMP_MSVC==1600)
- // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
- // (i.e., they do not generate an unaligned load!)
- // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
- // so we could also enable it for MSVC08, but first we have to make sure the latter does not generate bad code when doing so...
- __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); // low two floats
- res = _mm_loadh_pi(res, (const __m64*)(from+2)); // high two floats
- return res;
- #else
- return _mm_loadu_ps(from);
- #endif
- }
- template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
-#else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require pointer casting to incompatible pointer types and leads to invalid code
-// because of the strict aliasing rule. The "dummy" stuff is required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
-// NOTE: with the code below, MSVC's compiler crashes!
-
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8)))
- // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
-#elif EIGEN_COMP_CLANG
- // bug 201: Segfaults in _mm_loadh_pd with clang 2.8
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#else
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_ps(from);
-#else
- __m128d res;
- res = _mm_load_sd((const double*)(from)) ; // low 8 bytes (floats 0 and 1)
- res = _mm_loadh_pd(res, (const double*)(from+2)) ; // high 8 bytes (floats 2 and 3)
- return _mm_castpd_ps(res);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_pd(from);
-#else
- __m128d res;
- res = _mm_load_sd(from) ; // low double
- res = _mm_loadh_pd(res,from+1); // high double
- return res;
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-#else
- __m128d res;
- res = _mm_load_sd((const double*)(from)) ; // low 8 bytes (ints 0 and 1)
- res = _mm_loadh_pd(res, (const double*)(from+2)) ; // high 8 bytes (ints 2 and 3)
- return _mm_castpd_si128(res);
-#endif
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1); // reads two floats, returns (from[0], from[0], from[1], from[1])
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{ return pset1<Packet2d>(from[0]); } // returns (from[0], from[0])
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
- Packet4i tmp;
- tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from)); // reads the first two ints
- return vec4i_swizzle1(tmp, 0, 0, 1, 1); // returns (from[0], from[0], from[1], from[1])
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } // aligned store: `to` must be 16-byte aligned
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { // unaligned store of two doubles
- EIGEN_DEBUG_UNALIGNED_STORE
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES // NOTE(review): only defined in the non-MSVC branch of the ploadu section; when undefined this is 0 and the split store is used
- _mm_storeu_pd(to, from);
-#else
- _mm_storel_pd((to), from); // low double
- _mm_storeh_pd((to+1), from); // high double
-#endif
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); } // forwards to the double version on the same 16 bytes
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); } // forwards to the double version on the same 16 bytes
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
- return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); // lane i = from[i*stride] (_mm_set_ps takes the highest lane first)
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
-{
- return _mm_set_pd(from[1*stride], from[0*stride]); // lane i = from[i*stride]
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
-{
- return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); // lane i = from[i*stride]
- }
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) // writes lane i to to[stride*i]
-{
- to[stride*0] = _mm_cvtss_f32(from);
- to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); // the shuffle moves lane 1 into lane 0 so it can be extracted
- to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
- to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride) // writes lane i to to[stride*i]
-{
- to[stride*0] = _mm_cvtsd_f64(from);
- to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); // the shuffle moves the high double into lane 0
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) // writes lane i to to[stride*i]
-{
- to[stride*0] = _mm_cvtsi128_si32(from);
- to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); // the shuffle moves lane 1 into lane 0
- to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
- to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
-}
-
-// Stores `a` into four consecutive floats; some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
- Packet4f pa = _mm_set_ss(a); // a in lane 0, other lanes zero
- pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); // broadcast lane 0, then aligned store
-}
-// Stores `a` into two consecutive doubles; some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
- Packet2d pa = _mm_set_sd(a); // a in lane 0, other lane zero
- pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); // broadcast lane 0, then aligned store
-}
-
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } // prefetch with the T0 hint
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-#endif
-
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-// Direct access to the struct members fixes bug #62.
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif EIGEN_COMP_MSVC_STRICT
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#else
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); } // returns lane 0 of the packet
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); } // 0x1B selects lanes 3,2,1,0: reversed order
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); } // swaps the two doubles
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); } // 0x1B selects lanes 3,2,1,0: reversed order
-
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
- static Packet4f run(const Packet4f& a) {
- return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // result lane i = a[(offset + i) % 4]
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
- static Packet4i run(const Packet4i& a) {
- return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // result lane i = a[(offset + i) % 4]
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
- static Packet2d run(const Packet2d& a) {
- return vec2d_swizzle1(a, offset, (offset + 1) % 2); // result lanes (a[offset], a[(offset + 1) % 2])
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); // every bit except the sign bit
- return _mm_and_ps(a,mask); // clearing the sign bit gives |a|
-}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); // sign bit sits in the upper 32-bit half of each double
- return _mm_and_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
- #ifdef EIGEN_VECTORIZE_SSSE3
- return _mm_abs_epi32(a);
- #else
- Packet4i aux = _mm_srai_epi32(a,31); // arithmetic shift: all ones for negative lanes, zero otherwise
- return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); // (a ^ aux) - aux negates exactly the negative lanes
- #endif
-}
-
-// with AVX, the default implementations based on pload1 are faster, so these are compiled only without AVX
-#ifndef __AVX__
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- a3 = pload<Packet4f>(a); // aligned load of a[0..3]; a3 doubles as scratch until the last line
- a0 = vec4f_swizzle1(a3, 0,0,0,0); // a0 = a[0] in every lane
- a1 = vec4f_swizzle1(a3, 1,1,1,1);
- a2 = vec4f_swizzle1(a3, 2,2,2,2);
- a3 = vec4f_swizzle1(a3, 3,3,3,3);
-}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-#ifdef EIGEN_VECTORIZE_SSE3
- a0 = _mm_loaddup_pd(a+0); // a[i] duplicated into both lanes of a<i>
- a1 = _mm_loaddup_pd(a+1);
- a2 = _mm_loaddup_pd(a+2);
- a3 = _mm_loaddup_pd(a+3);
-#else
- a1 = pload<Packet2d>(a); // aligned load of a[0..1]; a1 is scratch until overwritten below
- a0 = vec2d_swizzle1(a1, 0,0);
- a1 = vec2d_swizzle1(a1, 1,1);
- a3 = pload<Packet2d>(a+2); // aligned load of a[2..3]; a3 is scratch until overwritten below
- a2 = vec2d_swizzle1(a3, 0,0);
- a3 = vec2d_swizzle1(a3, 1,1);
-#endif
-}
-#endif
-
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) // replaces vecs[0..3] with broadcasts of lanes 0..3 of the incoming vecs[0]
-{
- vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); // 0x55: lane 1 in every position
- vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); // 0xAA: lane 2 in every position
- vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); // 0xFF: lane 3 in every position
- vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); // done last because vecs[0] is the source of the other three
-}
-
-#ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); // lane i = sum of the four lanes of vecs[i]
-}
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- return _mm_hadd_pd(vecs[0], vecs[1]); // lane i = sum of the two lanes of vecs[i]
-}
-// SSSE3 version (kept commented out):
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp0 = _mm_hadd_ps(a,a); // pairwise sums
- return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0)); // lane 0 holds the sum of all four lanes
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); } // sum of both lanes
-
-// SSSE3 version (kept commented out):
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-// Packet4i tmp0 = _mm_hadd_epi32(a,a);
-// return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
-#else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); // lanes 0 and 1 now hold a0+a2 and a1+a3
- return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); // lane 0 = (a0+a2) + (a1+a3)
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
- return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); // lane 0 = a0 + a1
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) // lane i of the result = sum of the four lanes of vecs[i]
-{
- Packet4f tmp0, tmp1, tmp2;
- tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
- tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
- tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
- tmp0 = _mm_add_ps(tmp0, tmp1); // partial sums of vecs[0] and vecs[1], interleaved
- tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
- tmp1 = _mm_add_ps(tmp1, tmp2); // partial sums of vecs[2] and vecs[3], interleaved
- tmp2 = _mm_movehl_ps(tmp1, tmp0);
- tmp0 = _mm_movelh_ps(tmp0, tmp1);
- return _mm_add_ps(tmp0, tmp2); // combine the two halves of the partial sums
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); // lane i = sum of the two lanes of vecs[i]
-}
-#endif // SSE3
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
- return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
- Packet4i tmp0, tmp1, tmp2;
- tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
- tmp0 = _mm_add_epi32(tmp0, tmp1);
- tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- tmp1 = _mm_add_epi32(tmp1, tmp2);
- tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
- tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
- return _mm_add_epi32(tmp0, tmp2);
-}
-
-// Other reduction functions:
-
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
- // after some experiments, it is seems this is the fastest way to implement it
- // for GCC (eg., reusing pmul is very slow !)
- // TODO try to call _mm_mul_epu32 directly
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
- // after some experiments, it is seems this is the fastest way to implement it
- // for GCC (eg., it does not like using std::min after the pstore !!)
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
- return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
- // after some experiments, it is seems this is the fastest way to implement it
- // for GCC (eg., it does not like using std::min after the pstore !!)
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
- return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
-}
-
-#if EIGEN_COMP_GNUC
-// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-// {
-// Packet4f res = b;
-// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-// return res;
-// }
-// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
-// {
-// Packet4i res = a;
-// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
-// return res;
-// }
-#endif
-
-#ifdef EIGEN_VECTORIZE_SSSE3
-// SSSE3 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- if (Offset!=0)
- first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
- {
- if (Offset!=0)
- first = _mm_alignr_epi8(second,first, Offset*4);
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset==1)
- first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
- }
-};
-#else
-// SSE2 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- if (Offset==1)
- {
- first = _mm_move_ss(first,second);
- first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
- }
- else if (Offset==2)
- {
- first = _mm_movehl_ps(first,first);
- first = _mm_movelh_ps(first,second);
- }
- else if (Offset==3)
- {
- first = _mm_move_ss(first,second);
- first = _mm_shuffle_ps(first,second,0x93);
- }
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
- {
- if (Offset==1)
- {
- first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- first = _mm_shuffle_epi32(first,0x39);
- }
- else if (Offset==2)
- {
- first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
- first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- }
- else if (Offset==3)
- {
- first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
- }
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset==1)
- {
- first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
- first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
- }
- }
-};
-#endif
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
- __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
- kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
- kernel.packet[1] = tmp;
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
- __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
- __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
- __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
- __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
-
- kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
- kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
- kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
- kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m128i false_mask = _mm_cmpeq_epi32(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
- const __m128 zero = _mm_setzero_ps();
- const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m128 false_mask = _mm_cmpeq_ps(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- const __m128d zero = _mm_setzero_pd();
- const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
- __m128d false_mask = _mm_cmpeq_pd(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
-#endif
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h
deleted file mode 100644
index c848932306..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_SSE_H
-#define EIGEN_TYPE_CASTING_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <>
-struct type_casting_traits<float, int> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
- return _mm_cvttps_epi32(a);
-}
-
-
-template <>
-struct type_casting_traits<int, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
- return _mm_cvtepi32_ps(a);
-}
-
-
-template <>
-struct type_casting_traits<double, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 2,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
- return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
-
-template <>
-struct type_casting_traits<float, double> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 2
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
- // Simply discard the second half of the input
- return _mm_cvtps_pd(a);
-}
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_SSE_H