diff options
author | 2016-01-12 11:11:40 -0800 | |
---|---|---|
committer | 2016-01-12 11:11:40 -0800 | |
commit | 42673c3a588ce4cc20b02ab20e5e9d38b64a3cb4 (patch) | |
tree | 1545b8e2411a774728685a4da519058897d49ee5 /third_party/eigen3/Eigen/src/Core/arch/SSE | |
parent | ccf01f9d77b28b649777f5a937a295f6dee2a130 (diff) |
Deleted the remainder of the local copy of eigen that is shipped with
TensorFlow.
Diffstat (limited to 'third_party/eigen3/Eigen/src/Core/arch/SSE')
4 files changed, 0 insertions, 1975 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h deleted file mode 100644 index 2722893dcf..0000000000 --- a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h +++ /dev/null @@ -1,486 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COMPLEX_SSE_H -#define EIGEN_COMPLEX_SSE_H - -namespace Eigen { - -namespace internal { - -//---------- float ---------- -struct Packet2cf -{ - EIGEN_STRONG_INLINE Packet2cf() {} - EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; -}; - -// Use the packet_traits defined in AVX/PacketMath.h instead if we're going -// to leverage AVX instructions. -#ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits<std::complex<float> > : default_packet_traits -{ - typedef Packet2cf type; - typedef Packet2cf half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0, - HasBlend = 1, - }; -}; -#endif - -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) -{ - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); - return Packet2cf(_mm_xor_ps(a.v,mask)); -} -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_xor_ps(a.v,mask)); -} - -template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for SSE3 and 4 - #ifdef EIGEN_VECTORIZE_SSE3 - return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), - _mm_mul_ps(_mm_movehdup_ps(a.v), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); -// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), -// _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), -// vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif -} - -template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } - -template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); } - -template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) -{ - Packet2cf res; -#if EIGEN_GNUC_AT_MOST(4,2) - // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2 - res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from)); -#elif EIGEN_GNUC_AT_LEAST(4,6) - // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6 - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wuninitialized" - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); - #pragma GCC diagnostic pop -#else - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); -#endif - return Packet2cf(_mm_movelh_ps(res.v,res.v)); -} - -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } - -template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } - - -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride) -{ - return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), - std::imag(from[0*stride]), std::real(from[0*stride]))); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride) -{ - to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), - _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); - to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), - _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); -} - -template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } - -template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) -{ - #if EIGEN_GNUC_AT_MOST(4,3) - // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares... - // This workaround also fix invalid code generation with gcc 4.3 - EIGEN_ALIGN16 std::complex<float> res[2]; - _mm_store_ps((float*)res, a.v); - return res[0]; - #else - std::complex<float> res; - _mm_storel_pi((__m64*)&res, a.v); - return res; - #endif -} - -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } - -template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) -{ - return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); -} - -template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) -{ - return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); -} - -template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) -{ - return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); -} - -template<int Offset> -struct palign_impl<Offset,Packet2cf> -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = _mm_movehl_ps(first.v, first.v); - first.v = _mm_movelh_ps(first.v, second.v); - } - } -}; - -template<> struct conj_helper<Packet2cf, Packet2cf, false,true> -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper<Packet2cf, Packet2cf, true,false> -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper<Packet2cf, Packet2cf, true,true> -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper<Packet4f, Packet2cf, false,false> -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); } -}; - -template<> struct conj_helper<Packet2cf, Packet4f, false,false> -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); } -}; - -template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); - __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); -} - -EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} - - -//---------- double ---------- -struct Packet1cd -{ - EIGEN_STRONG_INLINE Packet1cd() {} - EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; -}; - -// Use the packet_traits defined in AVX/PacketMath.h instead if we're going -// to leverage AVX instructions. -#ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits<std::complex<double> > : default_packet_traits -{ - typedef Packet1cd type; - typedef Packet1cd half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = 1, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0 - }; -}; -#endif - -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; }; - -template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) -{ - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_xor_pd(a.v,mask)); -} - -template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for SSE3 and 4 - #ifdef EIGEN_VECTORIZE_SSE3 - return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } - -// FIXME force unaligned load, this is a temporary fix -template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) -{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } - -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } - -// FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } - -template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } - -template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) -{ - EIGEN_ALIGN16 double res[2]; - _mm_store_pd(res, a.v); - return std::complex<double>(res[0],res[1]); -} - -template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } - -template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) -{ - return pfirst(a); -} - -template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) -{ - return vecs[0]; -} - -template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) -{ - return pfirst(a); -} - -template<int Offset> -struct palign_impl<Offset,Packet1cd> -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - -template<> struct conj_helper<Packet1cd, Packet1cd, false,true> -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper<Packet1cd, Packet1cd, true,false> -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper<Packet1cd, Packet1cd, true,true> -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper<Packet2d, Packet1cd, false,false> -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); } -}; - -template<> struct conj_helper<Packet1cd, Packet2d, false,false> -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); } -}; - -template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); - __m128d s = _mm_mul_pd(b.v,b.v); - return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); -} - -EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock<Packet2cf,2>& kernel) { - __m128d w1 = _mm_castps_pd(kernel.packet[0].v); - __m128d w2 = _mm_castps_pd(kernel.packet[1].v); - - __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); - kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); - kernel.packet[1].v = tmp; -} - -template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { - __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); - return Packet2cf(_mm_castpd_ps(result)); -} - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_SSE_H diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h deleted file mode 100644 index 0baa7b4b58..0000000000 --- a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +++ /dev/null @@ -1,529 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2007 Julien Pommier -// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -#ifndef EIGEN_MATH_FUNCTIONS_SSE_H -#define EIGEN_MATH_FUNCTIONS_SSE_H - -namespace Eigen { - -namespace internal { - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog<Packet4f>(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - // invalid_mask is set to true when x is NaN - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp<Packet4f>(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_ps(fx); -#else - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, fx); - mask = _mm_and_ps(mask, p4f_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = _mm_cvttps_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_0x7f); - emm0 = _mm_slli_epi32(emm0, 23); - return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); -} -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp<Packet2d>(const Packet2d& _x) -{ - Packet2d x = _x; - - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - - Packet2d tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_pd(fx); -#else - emm0 = _mm_cvttpd_epi32(fx); - tmp = _mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, fx); - mask = _mm_and_pd(mask, p2d_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = _mm_cvttpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); -} - -/* evaluation of 4 sines at onces, using SSE2 intrinsics. - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psin<Packet4f>(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, sign_bit, y; - - Packet4i emm0, emm2; - sign_bit = x; - /* take the absolute value */ - x = pabs(x); - - /* take the modulo */ - - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* store the integer part of y in mm0 */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - /* get the swap sign flag */ - emm0 = _mm_and_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4<x<=Pi/2 - - Both branches will be computed. - */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f swap_sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = _mm_mul_ps(x,x); - - y = pmadd(y, z, p4f_coscof_p1); - y = pmadd(y, z, p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = pmul(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmul(y2, x); - y2 = padd(y2, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - /* update the sign */ - return _mm_xor_ps(y, sign_bit); -} - -/* almost the same as psin */ -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pcos<Packet4f>(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); -} - -#if EIGEN_FAST_MATH - -// This is based on Quake3's fast inverse square root. -// For detail see here: http://www.beyond3d.com/content/articles/8/ -// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly. -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psqrt<Packet4f>(const Packet4f& _x) -{ - Packet4f half = pmul(_x, pset1<Packet4f>(.5f)); - - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - - x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x)))); - return pmul(_x,x); -} - -#else - -template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } - -#endif - -template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } - - -#if EIGEN_FAST_MATH - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt<Packet4f>(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); - _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); - - Packet4f neg_half = pmul(_x, p4f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); -} - -#else - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt<Packet4f>(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. - return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x)); -} - -#endif - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt<Packet2d>(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. - return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x)); -} - -// Identical to the ptanh in GenericPacketMath.h, but for doubles use -// a small/medium approximation threshold of 0.001. -template<> EIGEN_STRONG_INLINE Packet2d ptanh_approx_threshold() { - return pset1<Packet2d>(0.001); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h deleted file mode 100644 index 7f4274fd99..0000000000 --- a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +++ /dev/null @@ -1,883 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_SSE_H -#define EIGEN_PACKET_MATH_SSE_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) -#endif - -#ifdef __FMA__ -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 -#endif -#endif - -typedef __m128 Packet4f; -typedef __m128i Packet4i; -typedef __m128d Packet2d; - -template<> struct is_arithmetic<__m128> { enum { value = true }; }; -template<> struct is_arithmetic<__m128i> { enum { value = true }; }; -template<> struct is_arithmetic<__m128d> { enum { value = true }; }; - -#define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) - -#define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) - -#define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - -#define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) - -#define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f p4f_##NAME = pset1<Packet4f>(X) - -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ - const Packet2d p2d_##NAME = pset1<Packet2d>(X) - -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X)) - -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i p4i_##NAME = pset1<Packet4i>(X) - - -// Use the packet_traits defined in AVX/PacketMath.h instead if we're going -// to leverage AVX instructions. -#ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits<float> : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasTanH = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - - HasBlend = 1, - HasSelect = 1, - HasEq = 1, - }; -}; -template<> struct packet_traits<double> : default_packet_traits -{ - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasTanH = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - - HasBlend = 1, - HasSelect = 1, - HasEq = 1, - }; -}; -#endif -template<> struct packet_traits<int> : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; - enum { - // FIXME check the Has* - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - - HasBlend = 1, - }; -}; - -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; - -#if EIGEN_COMP_MSVC==1500 -// Workaround MSVC 9 internal compiler error. -// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode -// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)). -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } -#endif - -// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. -// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) -// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. -// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. -// Also note that with AVX, we want it to generate a vbroadcastss. -#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) -template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { - return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); -} -#endif - -#ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } -#endif -template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } - -template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f ple<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d ple<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f plt<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d plt<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f peq<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d peq<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) { -#if defined(EIGEN_VECTORIZE_SSE4_1) - return _mm_blendv_ps(a, b, false_mask); -#else - return _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) { -#if defined(EIGEN_VECTORIZE_SSE4_1) - return _mm_blendv_pd(a, b, false_mask); -#else - return _mm_or_pd(_mm_andnot_pd(false_mask, a), _mm_and_pd(false_mask, b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); - return _mm_xor_ps(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000)); - return _mm_xor_pd(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) -{ - return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); -} - -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_mullo_epi32(a,b); -#else - // this version is slightly faster than 4 scalar products - return vec4i_swizzle1( - vec4i_swizzle2( - _mm_mul_epu32(a,b), - _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2), - vec4i_swizzle1(b,1,0,3,2)), - 0,2,0,2), - 0,2,1,3); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by SSE"); - return pset1<Packet4i>(0); -} - -// for some weird raisons, it has to be overloaded for packet of integers -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef __FMA__ -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } -#endif - -template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_min_epi32(a,b); -#else - // after some bench, this version *is* faster than a scalar implementation - Packet4i mask = _mm_cmplt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_max_epi32(a,b); -#else - // after some bench, this version *is* faster than a scalar implementation - Packet4i mask = _mm_cmpgt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } - -#if EIGEN_COMP_MSVC - template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - #if (EIGEN_COMP_MSVC==1600) - // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps - // (i.e., it does not generate an unaligned load!! - // TODO On most architectures this version should also be faster than a single _mm_loadu_ps - // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so... - __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); - res = _mm_loadh_pi(res, (const __m64*)(from+2)); - return res; - #else - return _mm_loadu_ps(from); - #endif - } - template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); } -#else -// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would -// require pointer casting to incompatible pointer types and leads to invalid code -// because of the strict aliasing rule. The "dummy" stuff are required to enforce -// a correct instruction dependency. -// TODO: do the same for MSVC (ICC is compatible) -// NOTE: with the code below, MSVC's compiler crashes! - -#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8))) - // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 -#elif EIGEN_COMP_CLANG - // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 -#else - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 -#endif - -template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_ps(from); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_ps(res); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_pd(from); -#else - __m128d res; - res = _mm_load_sd(from) ; - res = _mm_loadh_pd(res,from+1); - return res; -#endif -} -template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_si128(res); -#endif -} -#endif - -template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) -{ - return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1); -} -template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) -{ return pset1<Packet2d>(from[0]); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) -{ - Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from)); - return vec4i_swizzle1(tmp, 0, 0, 1, 1); -} - -template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } - -template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { - EIGEN_DEBUG_UNALIGNED_STORE -#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES - _mm_storeu_pd(to, from); -#else - _mm_storel_pd((to), from); - _mm_storeh_pd((to+1), from); -#endif -} -template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); } -template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); } - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride) -{ - return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); -} -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride) -{ - return _mm_set_pd(from[1*stride], from[0*stride]); -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride) -{ - return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); - } - -template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) -{ - to[stride*0] = _mm_cvtss_f32(from); - to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); - to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); - to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride) -{ - to[stride*0] = _mm_cvtsd_f64(from); - to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) -{ - to[stride*0] = _mm_cvtsi128_si32(from); - to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); - to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); - to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); -} - -// some compilers might be tempted to perform multiple moves instead of using a vector path. -template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) -{ - Packet4f pa = _mm_set_ss(a); - pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); -} -// some compilers might be tempted to perform multiple moves instead of using a vector path. -template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) -{ - Packet2d pa = _mm_set_sd(a); - pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); -} - -#ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -#endif - -#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -// Direct of the struct members fixed bug #62. -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; } -template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#elif EIGEN_COMP_MSVC_STRICT -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } -template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#else -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); } -template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); } -#endif - -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ return _mm_shuffle_ps(a,a,0x1B); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return _mm_shuffle_pd(a,a,0x1); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ return _mm_shuffle_epi32(a,0x1B); } - -template<size_t offset> -struct protate_impl<offset, Packet4f> -{ - static Packet4f run(const Packet4f& a) { - return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); - } -}; - -template<size_t offset> -struct protate_impl<offset, Packet4i> -{ - static Packet4i run(const Packet4i& a) { - return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); - } -}; - -template<size_t offset> -struct protate_impl<offset, Packet2d> -{ - static Packet2d run(const Packet2d& a) { - return vec2d_swizzle1(a, offset, (offset + 1) % 2); - } -}; - -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm_and_ps(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); - return _mm_and_pd(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) -{ - #ifdef EIGEN_VECTORIZE_SSSE3 - return _mm_abs_epi32(a); - #else - Packet4i aux = _mm_srai_epi32(a,31); - return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); - #endif -} - -// with AVX, the default implementations based on pload1 are faster -#ifndef __AVX__ -template<> EIGEN_STRONG_INLINE void -pbroadcast4<Packet4f>(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload<Packet4f>(a); - a0 = vec4f_swizzle1(a3, 0,0,0,0); - a1 = vec4f_swizzle1(a3, 1,1,1,1); - a2 = vec4f_swizzle1(a3, 2,2,2,2); - a3 = vec4f_swizzle1(a3, 3,3,3,3); -} -template<> EIGEN_STRONG_INLINE void -pbroadcast4<Packet2d>(const double *a, - Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) -{ -#ifdef EIGEN_VECTORIZE_SSE3 - a0 = _mm_loaddup_pd(a+0); - a1 = _mm_loaddup_pd(a+1); - a2 = _mm_loaddup_pd(a+2); - a3 = _mm_loaddup_pd(a+3); -#else - a1 = pload<Packet2d>(a); - a0 = vec2d_swizzle1(a1, 0,0); - a1 = vec2d_swizzle1(a1, 1,1); - a3 = pload<Packet2d>(a+2); - a2 = vec2d_swizzle1(a3, 0,0); - a3 = vec2d_swizzle1(a3, 1,1); -#endif -} -#endif - -EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) -{ - vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); - vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); - vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); - vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); -} - -#ifdef EIGEN_VECTORIZE_SSE3 -// TODO implement SSE2 versions as well as integer versions -template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) -{ - return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); -} -template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) -{ - return _mm_hadd_pd(vecs[0], vecs[1]); -} -// SSSE3 version: -// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -// { -// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -// } - -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0)); -} - -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); } - -// SSSE3 version: -// EIGEN_STRONG_INLINE float predux(const Packet4i& a) -// { -// Packet4i tmp0 = _mm_hadd_epi32(a,a); -// return pfirst(_mm_hadd_epi32(tmp0, tmp0)); -// } -#else -// SSE2 versions -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) -{ - return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); -} - -template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) -{ - Packet4f tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) -{ - return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); -} -#endif // SSE3 - -template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) -{ - Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)); -} - -template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) -{ - Packet4i tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - tmp0 = _mm_add_epi32(tmp0, tmp1); - tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - tmp1 = _mm_add_epi32(tmp1, tmp2); - tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); - tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); - return _mm_add_epi32(tmp0, tmp2); -} - -// Other reduction functions: - -// mul -template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} -template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) -{ - return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); -} -template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) -{ - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., reusing pmul is very slow !) - // TODO try to call _mm_mul_epu32 directly - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]);; -} - -// min -template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} -template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) -{ - return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); -} -template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; - int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; - return aux0<aux2 ? aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} - -// max -template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} -template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) -{ - return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); -} -template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; - int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; - return aux0>aux2 ? aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} - -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -// { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; -// } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) -// { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; -// } -#endif - -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template<int Offset> -struct palign_impl<Offset,Packet4f> -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; - -template<int Offset> -struct palign_impl<Offset,Packet4i> -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template<int Offset> -struct palign_impl<Offset,Packet2d> -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template<int Offset> -struct palign_impl<Offset,Packet4f> -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template<int Offset> -struct palign_impl<Offset,Packet4i> -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset==1) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; - -template<int Offset> -struct palign_impl<Offset,Packet2d> -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif - -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock<Packet4f,4>& kernel) { - _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); -} - -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock<Packet2d,2>& kernel) { - __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); - kernel.packet[1] = tmp; -} - -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock<Packet4i,4>& kernel) { - __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); - __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); - __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); - __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); - kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); - kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); - kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); -} - -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - const __m128i zero = _mm_setzero_si128(); - const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128i false_mask = _mm_cmpeq_epi32(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); -#else - return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); -#endif -} -template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - const __m128 zero = _mm_setzero_ps(); - const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128 false_mask = _mm_cmpeq_ps(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_ps(thenPacket, elsePacket, false_mask); -#else - return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - const __m128d zero = _mm_setzero_pd(); - const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); - __m128d false_mask = _mm_cmpeq_pd(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_pd(thenPacket, elsePacket, false_mask); -#else - return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); -#endif -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_SSE_H diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h deleted file mode 100644 index c848932306..0000000000 --- a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h +++ /dev/null @@ -1,77 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_SSE_H -#define EIGEN_TYPE_CASTING_SSE_H - -namespace Eigen { - -namespace internal { - -template <> -struct type_casting_traits<float, int> { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - -template <> -struct type_casting_traits<int, float> { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - -template <> -struct type_casting_traits<double, float> { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - -template <> -struct type_casting_traits<float, double> { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) { - // Simply discard the second half of the input - return _mm_cvtps_pd(a); -} - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_SSE_H |